evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py
@@ -0,0 +1,153 @@
+ import base64
+ import os
+ import torch
+ from openai import OpenAI
+ from typing import List
+
+ from .vqa_model import VQAScoreModel
+
+ default_question_template = 'Does this figure show "{}"? Please answer yes or no.'
+ default_answer_template = 'Yes'
+
+ GPT4V_MODELS = {
+     # We recommend using 'gpt-4-turbo' for optimal performance.
+     'gpt-4-turbo': {},
+     'gpt-4o': {},
+ }
+
+
+ # Function to encode the image
+ def encode_image(image_path):
+     with open(image_path, 'rb') as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def get_image_type(image_path):
+     image_type = image_path.split('.')[-1]
+     assert image_type in ['png', 'jpeg', 'jpg', 'gif', 'bmp', 'webp']
+     return image_type
+
+
+ class GPT4VModel(VQAScoreModel):
+     """A wrapper for the GPT4V models"""
+
+     def __init__(self, model_name='gpt-4-turbo', device='cuda', cache_dir=None, openai_key=None, top_logprobs=2):
+         assert model_name in GPT4V_MODELS
+         assert openai_key is not None, 'Please provide an OpenAI API key'
+         self.openai_key = openai_key
+         self.top_logprobs = top_logprobs
+         super().__init__(model_name=model_name, device=device, cache_dir=cache_dir)
+
+     def load_model(self):
+         """Load the model, tokenizer, image transform
+         """
+         import tiktoken
+
+         self.tokenizer = tiktoken.encoding_for_model(self.model_name)
+         self.client = OpenAI(api_key=self.openai_key)
+         # self.candidate_answers = GPT4V_MODELS[self.model_name]['candidate_answers']
+         # assert GPT4V_MODELS[self.model_name]['answer'] in self.candidate_answers
+         # self.candidate_tokens = []
+         # for ans in self.candidate_answers:
+         #     token = self.tokenizer.encode(ans)
+         #     assert len(token) == 1, "Currently only support single token answers"
+         #     self.candidate_tokens.append(token[0])
+
+     def load_images(self, image: List[str]) -> torch.Tensor:
+         """Load the image(s), and return the string
+         """
+         image = [{'path': img, 'type': get_image_type(img), 'base64': encode_image(img)} for img in image]
+         return image
+
+     def forward_single(self, image, question, answer):
+         try:
+             completion = self.client.chat.completions.create(
+                 model=self.model_name,
+                 messages=[{
+                     'role':
+                     'user',
+                     'content': [{
+                         'type': 'text',
+                         'text': question
+                     }, {
+                         'type': 'image_url',
+                         'image_url': {
+                             'url': f"data:image/{image['type']};base64,{image['base64']}"
+                         }
+                     }]
+                 }],
+                 logprobs=True,
+                 top_logprobs=self.top_logprobs,
+                 # logit_bias={yes_token:50, no_token:50}
+             )
+         except:
+             print(
+                 f"Warning: completion not generated for image: {image['path']} and question: {question} and answer: {answer}"
+             )
+             print(f'Trying again with the same image')
+             try:
+                 completion = self.client.chat.completions.create(
+                     model=self.model_name,
+                     messages=[{
+                         'role':
+                         'user',
+                         'content': [{
+                             'type': 'text',
+                             'text': question
+                         }, {
+                             'type': 'image_url',
+                             'image_url': {
+                                 'url': f"data:image/{image['type']};base64,{image['base64']}"
+                             }
+                         }]
+                     }],
+                     logprobs=True,
+                     top_logprobs=self.top_logprobs,
+                 )
+             except:
+                 print(f"Failed image: {image['path']} and question: {question} and answer: {answer}")
+                 return torch.Tensor([0.0])
+
+         # print(completion.choices[0].message)
+         # print(completion.choices[0].logprobs)
+         # print(completion.choices[0].logprobs.content[0])
+         is_generated = False
+         for top_logprob in completion.choices[0].logprobs.content[0].top_logprobs:
+             if top_logprob.token == answer:
+                 is_generated = True
+                 return torch.Tensor([top_logprob.logprob]).exp()
+         if not is_generated:
+             print(
+                 f"Warning: answer not generated for image: {image['path']} and question: {question} and answer: {answer}"
+             )
+             print(completion.choices[0].logprobs.content[0].top_logprobs)
+             return torch.Tensor([0.0])
+
+     def forward(
+         self,
+         images: List[str],
+         texts: List[str],
+         question_template: str = default_question_template,
+         answer_template: str = default_answer_template
+     ) -> torch.Tensor:
+         """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
+         """
+         assert len(images) == len(texts), 'Number of images and texts must match'
+         # Turn "a photo of a dog" into
+         # Q: "Does this figure show "a photo of a dog"? Please answer yes or no."
+         # A: "Yes"
+         questions = [question_template.format(text) for text in texts]
+         answers = [answer_template.format(text) for text in texts]
+
+         for ans in answers:
+             ans_tokens = self.tokenizer.encode(ans)
+             assert len(ans_tokens) == 1, 'Currently only support single token answers'
+
+         images = self.load_images(images)
+
+         lm_prob = torch.zeros(len(images))
+
+         for idx, (image, question, answer) in enumerate(zip(images, questions, answers)):
+             lm_prob[idx] = self.forward_single(image, question, answer)
+
+         return lm_prob
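
Note on the scoring logic above: GPT4VModel turns each (image, text) pair into a yes/no question and reads the probability of the answer token from the first generated token's logprobs. A minimal standalone sketch of that step, assuming the `openai` v1 client and a base64 data URL prepared as by `encode_image` above; the `yes_probability` helper and `max_tokens=1` are illustrative, not part of evalscope's API:

import math
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def yes_probability(data_url: str, caption: str, model: str = 'gpt-4o') -> float:
    """Return P('Yes') for the question 'Does this figure show "<caption>"?'."""
    question = f'Does this figure show "{caption}"? Please answer yes or no.'
    completion = client.chat.completions.create(
        model=model,
        messages=[{
            'role': 'user',
            'content': [
                {'type': 'text', 'text': question},
                {'type': 'image_url', 'image_url': {'url': data_url}},
            ],
        }],
        logprobs=True,
        top_logprobs=2,
        max_tokens=1,  # only the first token carries the yes/no decision
    )
    # Scan the top alternatives for the first generated token.
    for cand in completion.choices[0].logprobs.content[0].top_logprobs:
        if cand.token == 'Yes':
            return math.exp(cand.logprob)
    return 0.0  # 'Yes' was not among the top candidates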
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py
@@ -0,0 +1,26 @@
+ """
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ """
+
+ import os
+ import sys
+ from omegaconf import OmegaConf
+
+ from .common.registry import registry
+ from .models import *
+ from .processors import *
+
+ root_dir = os.path.dirname(os.path.abspath(__file__))
+ default_cfg = OmegaConf.load(os.path.join(root_dir, 'configs/default.yaml'))
+
+ registry.register_path('library_root', root_dir)
+ repo_root = os.path.join(root_dir, '..')
+ registry.register_path('repo_root', repo_root)
+ cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
+ registry.register_path('cache_root', cache_root)
+
+ registry.register('MAX_INT', sys.maxsize)
+ registry.register('SPLIT_NAMES', ['train', 'val', 'test'])
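
The `__init__` above wires module-level state into a global string-keyed registry at import time, so downstream code can look up paths and constants without importing this module directly. A toy sketch of that pattern; this is not the actual lavis `Registry` class (which lives in `common/registry.py`), only the mechanism it relies on:

class Registry:
    """Minimal string-keyed store mirroring the register()/register_path() calls above."""

    _store = {}

    @classmethod
    def register(cls, name, obj):
        # Refuse silent overwrites so two modules cannot claim the same key.
        if name in cls._store:
            raise KeyError(f'{name!r} is already registered')
        cls._store[name] = obj

    @classmethod
    def get(cls, name, default=None):
        return cls._store.get(name, default)


registry = Registry()
registry.register('MAX_INT', 2**63 - 1)
registry.register('SPLIT_NAMES', ['train', 'val', 'test'])
assert registry.get('SPLIT_NAMES') == ['train', 'val', 'test']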
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py
@@ -0,0 +1,465 @@
+ """
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ """
+
+ import json
+ import logging
+ from omegaconf import OmegaConf
+ from typing import Dict
+
+ from ..common.registry import registry
+
+
+ class Config:
+
+     def __init__(self, args):
+         self.config = {}
+
+         self.args = args
+
+         # Register the config and configuration for setup
+         registry.register('configuration', self)
+
+         user_config = self._build_opt_list(self.args.options)
+
+         config = OmegaConf.load(self.args.cfg_path)
+
+         runner_config = self.build_runner_config(config)
+         model_config = self.build_model_config(config, **user_config)
+         dataset_config = self.build_dataset_config(config)
+
+         # Validate the user-provided runner configuration
+         # model and dataset configuration are supposed to be validated by the respective classes
+         # [TODO] validate the model/dataset configuration
+         # self._validate_runner_config(runner_config)
+
+         # Override the default configuration with user options.
+         self.config = OmegaConf.merge(runner_config, model_config, dataset_config, user_config)
+
+     def _validate_runner_config(self, runner_config):
+         """
+         This method validates the configuration, such that
+         1) all the user-specified options are valid;
+         2) no type mismatches between the user-specified options and the config.
+         """
+         runner_config_validator = create_runner_config_validator()
+         runner_config_validator.validate(runner_config)
+
+     def _build_opt_list(self, opts):
+         opts_dot_list = self._convert_to_dot_list(opts)
+         return OmegaConf.from_dotlist(opts_dot_list)
+
+     @staticmethod
+     def build_model_config(config, **kwargs):
+         model = config.get('model', None)
+         assert model is not None, 'Missing model configuration file.'
+
+         model_cls = registry.get_model_class(model.arch)
+         assert model_cls is not None, f"Model '{model.arch}' has not been registered."
+
+         model_type = kwargs.get('model.model_type', None)
+         if not model_type:
+             model_type = model.get('model_type', None)
+             # else use the model type selected by user.
+
+         assert model_type is not None, 'Missing model_type.'
+
+         model_config_path = model_cls.default_config_path(model_type=model_type)
+
+         model_config = OmegaConf.create()
+         # hierarchy override, customized config > default config
+         model_config = OmegaConf.merge(
+             model_config,
+             OmegaConf.load(model_config_path),
+             {'model': config['model']},
+         )
+
+         return model_config
+
+     @staticmethod
+     def build_runner_config(config):
+         return {'run': config.run}
+
+     @staticmethod
+     def build_dataset_config(config):
+         datasets = config.get('datasets', None)
+         if datasets is None:
+             raise KeyError("Expecting 'datasets' as the root key for dataset configuration.")
+
+         dataset_config = OmegaConf.create()
+
+         for dataset_name in datasets:
+             builder_cls = registry.get_builder_class(dataset_name)
+
+             dataset_config_type = datasets[dataset_name].get('type', 'default')
+             dataset_config_path = builder_cls.default_config_path(type=dataset_config_type)
+
+             # hierarchy override, customized config > default config
+             dataset_config = OmegaConf.merge(
+                 dataset_config,
+                 OmegaConf.load(dataset_config_path),
+                 {'datasets': {
+                     dataset_name: config['datasets'][dataset_name]
+                 }},
+             )
+
+         return dataset_config
+
+     def _convert_to_dot_list(self, opts):
+         if opts is None:
+             opts = []
+
+         if len(opts) == 0:
+             return opts
+
+         has_equal = opts[0].find('=') != -1
+
+         if has_equal:
+             return opts
+
+         return [(opt + '=' + value) for opt, value in zip(opts[0::2], opts[1::2])]
+
+     def get_config(self):
+         return self.config
+
+     @property
+     def run_cfg(self):
+         return self.config.run
+
+     @property
+     def datasets_cfg(self):
+         return self.config.datasets
+
+     @property
+     def model_cfg(self):
+         return self.config.model
+
+     def pretty_print(self):
+         logging.info('\n===== Running Parameters =====')
+         logging.info(self._convert_node_to_json(self.config.run))
+
+         logging.info('\n====== Dataset Attributes ======')
+         datasets = self.config.datasets
+
+         for dataset in datasets:
+             if dataset in self.config.datasets:
+                 logging.info(f'\n======== {dataset} =======')
+                 dataset_config = self.config.datasets[dataset]
+                 logging.info(self._convert_node_to_json(dataset_config))
+             else:
+                 logging.warning(f"No dataset named '{dataset}' in config. Skipping")
+
+         logging.info(f'\n====== Model Attributes ======')
+         logging.info(self._convert_node_to_json(self.config.model))
+
+     def _convert_node_to_json(self, node):
+         container = OmegaConf.to_container(node, resolve=True)
+         return json.dumps(container, indent=4, sort_keys=True)
+
+     def to_dict(self):
+         return OmegaConf.to_container(self.config)
+
+
+ def node_to_dict(node):
+     return OmegaConf.to_container(node)
+
+
+ class ConfigValidator:
+     """
+     This is a preliminary implementation to centralize and validate the configuration.
+     May be altered in the future.
+
+     A helper class to validate configurations from a yaml file.
+
+     This serves the following purposes:
+         1. Ensure all the options in the yaml are defined; raise an error if not.
+         2. When type mismatches are found, the validator will raise an error.
+         3. A central place to store and display helpful messages for supported configurations.
+     """
+
+     class _Argument:
+
+         def __init__(self, name, choices=None, type=None, help=None):
+             self.name = name
+             self.val = None
+             self.choices = choices
+             self.type = type
+             self.help = help
+
+         def __str__(self):
+             s = f'{self.name}={self.val}'
+             if self.type is not None:
+                 s += f', ({self.type})'
+             if self.choices is not None:
+                 s += f', choices: {self.choices}'
+             if self.help is not None:
+                 s += f', ({self.help})'
+             return s
+
+     def __init__(self, description):
+         self.description = description
+
+         self.arguments = dict()
+
+         self.parsed_args = None
+
+     def __getitem__(self, key):
+         assert self.parsed_args is not None, 'No arguments parsed yet.'
+
+         return self.parsed_args[key]
+
+     def __str__(self) -> str:
+         return self.format_help()
+
+     def add_argument(self, *args, **kwargs):
+         """
+         Assume the first argument is the name of the argument.
+         """
+         self.arguments[args[0]] = self._Argument(*args, **kwargs)
+
+     def validate(self, config=None):
+         """
+         Convert yaml config (dict-like) to list, required by argparse.
+         """
+         for k, v in config.items():
+             assert (
+                 k in self.arguments
+             ), f"""{k} is not a valid argument. Supported arguments are {self.format_arguments()}."""
+
+             if self.arguments[k].type is not None:
+                 try:
+                     self.arguments[k].val = self.arguments[k].type(v)
+                 except ValueError:
+                     raise ValueError(f'{k} is not a valid {self.arguments[k].type}.')
+
+             if self.arguments[k].choices is not None:
+                 assert (v in self.arguments[k].choices), f"""{k} must be one of {self.arguments[k].choices}."""
+
+         return config
+
+     def format_arguments(self):
+         return str([f'{k}' for k in sorted(self.arguments.keys())])
+
+     def format_help(self):
+         # description + key-value pair string for each argument
+         help_msg = str(self.description)
+         return help_msg + ', available arguments: ' + self.format_arguments()
+
+     def print_help(self):
+         # display help message
+         print(self.format_help())
+
+
+ def create_runner_config_validator():
+     validator = ConfigValidator(description='Runner configurations')
+
+     validator.add_argument(
+         'runner',
+         type=str,
+         choices=['runner_base', 'runner_iter'],
+         help="""Runner to use. The "runner_base" uses epoch-based training while iter-based
+             runner runs based on iters. Default: runner_base""",
+     )
+     # add argument for training dataset ratios
+     validator.add_argument(
+         'train_dataset_ratios',
+         type=Dict[str, float],
+         help="""Ratios of training dataset. This is used in the iteration-based runner.
+             Not supported for the epoch-based runner because how to define an epoch becomes tricky.
+             Default: None""",
+     )
+     validator.add_argument(
+         'max_iters',
+         type=float,
+         help='Maximum number of iterations to run.',
+     )
+     validator.add_argument(
+         'max_epoch',
+         type=int,
+         help='Maximum number of epochs to run.',
+     )
+     # add argument for iters_per_inner_epoch
+     validator.add_argument(
+         'iters_per_inner_epoch',
+         type=float,
+         help='Number of iterations per inner epoch. This is required when runner is runner_iter.',
+     )
+     lr_scheds_choices = registry.list_lr_schedulers()
+     validator.add_argument(
+         'lr_sched',
+         type=str,
+         choices=lr_scheds_choices,
+         help='Learning rate scheduler to use, from {}'.format(lr_scheds_choices),
+     )
+     task_choices = registry.list_tasks()
+     validator.add_argument(
+         'task',
+         type=str,
+         choices=task_choices,
+         help='Task to use, from {}'.format(task_choices),
+     )
+     # add argument for init_lr
+     validator.add_argument(
+         'init_lr',
+         type=float,
+         help='Initial learning rate. This will be the learning rate after warmup and before decay.',
+     )
+     # add argument for min_lr
+     validator.add_argument(
+         'min_lr',
+         type=float,
+         help='Minimum learning rate (after decay).',
+     )
+     # add argument for warmup_lr
+     validator.add_argument(
+         'warmup_lr',
+         type=float,
+         help='Starting learning rate for warmup.',
+     )
+     # add argument for learning rate decay rate
+     validator.add_argument(
+         'lr_decay_rate',
+         type=float,
+         help='Learning rate decay rate. Required if using a decaying learning rate scheduler.',
+     )
+     # add argument for weight decay
+     validator.add_argument(
+         'weight_decay',
+         type=float,
+         help='Weight decay rate.',
+     )
+     # add argument for training batch size
+     validator.add_argument(
+         'batch_size_train',
+         type=int,
+         help='Training batch size.',
+     )
+     # add argument for evaluation batch size
+     validator.add_argument(
+         'batch_size_eval',
+         type=int,
+         help='Evaluation batch size, including validation and testing.',
+     )
+     # add argument for number of workers for data loading
+     validator.add_argument(
+         'num_workers',
+         help='Number of workers for data loading.',
+     )
+     # add argument for warmup steps
+     validator.add_argument(
+         'warmup_steps',
+         type=int,
+         help='Number of warmup steps. Required if a warmup schedule is used.',
+     )
+     # add argument for random seed
+     validator.add_argument(
+         'seed',
+         type=int,
+         help='Random seed.',
+     )
+     # add argument for output directory
+     validator.add_argument(
+         'output_dir',
+         type=str,
+         help='Output directory to save checkpoints and logs.',
+     )
+     # add argument for evaluation-only mode
+     validator.add_argument(
+         'evaluate',
+         help='Whether to only evaluate the model. If true, training will not be performed.',
+     )
+     # add argument for splits used for training, e.g. ["train", "val"]
+     validator.add_argument(
+         'train_splits',
+         type=list,
+         help='Splits to use for training.',
+     )
+     # add argument for splits used for validation, e.g. ["val"]
+     validator.add_argument(
+         'valid_splits',
+         type=list,
+         help='Splits to use for validation. If not provided, will skip the validation.',
+     )
+     # add argument for splits used for testing, e.g. ["test"]
+     validator.add_argument(
+         'test_splits',
+         type=list,
+         help='Splits to use for testing. If not provided, will skip the testing.',
+     )
+     # add argument for accumulating gradient over iterations
+     validator.add_argument(
+         'accum_grad_iters',
+         type=int,
+         help='Number of iterations to accumulate gradient for.',
+     )
+
+     # ====== distributed training ======
+     validator.add_argument(
+         'device',
+         type=str,
+         choices=['cpu', 'cuda'],
+         help="Device to use. Supports 'cuda' or 'cpu' for now.",
+     )
+     validator.add_argument(
+         'world_size',
+         type=int,
+         help='Number of processes participating in the job.',
+     )
+     validator.add_argument('dist_url', type=str)
+     validator.add_argument('distributed', type=bool)
+     # add argument for whether to use a distributed sampler during evaluation
+     validator.add_argument(
+         'use_dist_eval_sampler',
+         type=bool,
+         help='Whether to use distributed sampler during evaluation or not.',
+     )
+
+     # ====== task specific ======
+     # generation task specific arguments
+     # add argument for maximal length of text output
+     validator.add_argument(
+         'max_len',
+         type=int,
+         help='Maximal length of text output.',
+     )
+     # add argument for minimal length of text output
+     validator.add_argument(
+         'min_len',
+         type=int,
+         help='Minimal length of text output.',
+     )
+     # add argument for number of beams
+     validator.add_argument(
+         'num_beams',
+         type=int,
+         help='Number of beams used for beam search.',
+     )
+
+     # vqa task specific arguments
+     # add argument for number of answer candidates
+     validator.add_argument(
+         'num_ans_candidates',
+         type=int,
+         help=
+         """For ALBEF and BLIP, these models first rank answers according to likelihood to select answer candidates.""",
+     )
+     # add argument for inference method
+     validator.add_argument(
+         'inference_method',
+         type=str,
+         choices=['generate', 'rank'],
+         help="""Inference method to use for question answering. If rank, requires an answer list.""",
+     )
+
+     # ====== model specific ======
+     validator.add_argument(
+         'k_test',
+         type=int,
+         help='Number of top k most similar samples from ITC/VTC selection to be tested.',
+     )
+
+     return validator
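
Two mechanics in `Config` above are worth seeing in isolation: `_convert_to_dot_list` accepts overrides either as 'key=value' tokens or as alternating key/value pairs, and the resulting dotlist is merged over the YAML defaults with later sources winning. A self-contained sketch of that merge semantics, assuming `omegaconf` is installed; the config keys and values are invented for illustration:

from omegaconf import OmegaConf

# Defaults as they might come from a YAML file (values invented for illustration).
defaults = OmegaConf.create({'run': {'max_epoch': 10, 'init_lr': 1e-4}})

# CLI-style overrides as alternating key/value tokens, normalized to a dotlist
# the same way Config._convert_to_dot_list does.
opts = ['run.max_epoch', '20']
if opts and '=' not in opts[0]:
    opts = [f'{k}={v}' for k, v in zip(opts[0::2], opts[1::2])]
overrides = OmegaConf.from_dotlist(opts)  # values are parsed as YAML scalars

merged = OmegaConf.merge(defaults, overrides)
assert merged.run.max_epoch == 20   # later source (the override) wins
assert merged.run.init_lr == 1e-4   # untouched defaults survive the merge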