evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/metrics/metric.py
@@ -0,0 +1,611 @@
+import json
+import numpy as np
+import os
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.api.metric import Aggregator, AggScore, Metric, SampleScore, SingletonMetric, T2IMetric
+from evalscope.api.registry import register_aggregation, register_metric
+from evalscope.utils.import_utils import check_import
+from .metrics import calculate_pass_at_k, calculate_pass_hat_k, mean, normalize_text
+
+# ##################
+# NLP Metrics ######
+# ##################
+
+
+@register_metric(name='exact_match')
+class ExactMatch(Metric):
+
+    def apply(self, predictions, references):
+        return [
+            float(normalize_text(prediction) == normalize_text(reference))
+            for prediction, reference in zip(predictions, references)
+        ]
+
+
+@register_metric(name='acc')
+class Accuracy(ExactMatch):
+
+    def __init__(self, allow_inclusion: bool = False, numeric: bool = False):
+        self.allow_inclusion = allow_inclusion
+        self.numeric = numeric
+
+    def apply(self, predictions, references):
+        if self.allow_inclusion:
+            results = []
+            for prediction, reference in zip(predictions, references):
+                if prediction and prediction in reference:
+                    results.append(1.0)
+                else:
+                    results.append(0.0)
+            return results
+        elif self.numeric:
+            from .math_parser import math_equal, strip_answer_string
+
+            results = []
+            for prediction, reference in zip(predictions, references):
+                ref_answer = strip_answer_string(reference)
+                results.append(float(math_equal(prediction, ref_answer)))
+
+            return results
+        else:
+            return super().apply(predictions, references)
+
+
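The three Accuracy modes, as a rough sketch (assuming normalize_text lowercases and strips, and that math_equal equates 0.5 and 1/2, which is typical of these helpers; inputs are illustrative):

    Accuracy().apply(['Paris'], ['paris'])                    # -> [1.0] via normalized exact match
    Accuracy(allow_inclusion=True).apply(['B'], ['A or B'])   # -> [1.0]; substring containment
    Accuracy(numeric=True).apply(['0.5'], ['1/2'])            # -> [1.0] if math_equal equates them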
+@register_metric(name='numeric_match')
+class NumericMatch(Metric):
+
+    def apply(self, predictions, references):
+        return [float(prediction == reference) for prediction, reference in zip(predictions, references)]
+
+
+@register_metric(name='math_acc')
+class MathAcc(Metric):
+
+    def apply(self, predictions, references):
+        from .math_parser import extract_answer, math_equal, strip_answer_string
+
+        results = []
+        for prediction, reference in zip(predictions, references):
+            pred_answer = strip_answer_string(extract_answer(prediction))
+            ref_answer = strip_answer_string(reference)
+            results.append(float(math_equal(pred_answer, ref_answer)))
+
+        return results
+
+
+@register_metric(name='multi_choice_acc')
+class MultiChoiceAcc(Metric):
+
+    def apply(self, predictions, references):
+        """
+        Calculate accuracy for multiple-choice questions.
+
+        Args:
+            predictions (List[str]): List of predicted answers.
+            references (List[str]): List of correct answers.
+
+        Returns:
+            List[float]: Per-sample scores in [0, 1], with partial credit for a correct subset of options.
+        """
+        res = []
+        for prediction, reference in zip(predictions, references):
+            prediction = set(prediction.strip().upper())
+            reference = set(reference.strip().upper())
+            # if the prediction contains an answer that is not in the reference, it is wrong
+            if not prediction.issubset(reference):
+                res.append(0.0)
+                continue
+            common = prediction.intersection(reference)
+            res.append(len(common) / len(reference) if reference else 0.0)
+        return res
+
+
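A worked example of the partial-credit rule above; the values follow directly from the set logic shown:

    metric = MultiChoiceAcc()
    metric.apply(['AC'], ['AC'])   # {'A','C'} == {'A','C'}          -> [1.0]
    metric.apply(['A'], ['AC'])    # subset; |common| / |ref| = 1/2  -> [0.5]
    metric.apply(['AB'], ['AC'])   # 'B' is not a correct option     -> [0.0]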
+@register_metric(name='anls')
+class ANLS(Metric):
+
+    def __init__(self, thresh_hold=0.5):
+        self.thresh_hold = thresh_hold
+
+    def apply(self, predictions, references):
+        """
+        Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
+        This implementation is adapted from
+        https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
+
+        Args:
+            references (List[str]): List of correct answers. Each answer can be a JSON string
+                encoding a list of acceptable answers.
+            predictions (List[str]): List of predicted answers.
+        """
+        from .metrics import levenshtein_distance
+
+        res = []
+        for prediction, reference in zip(predictions, references):
+            # Parse the reference, which may be a JSON-encoded list of answers
+            try:
+                answer = json.loads(reference)
+            except json.JSONDecodeError:
+                answer = reference
+            if isinstance(answer, str):
+                answer = [answer]
+            assert isinstance(answer, list), 'The reference answer should be a list of answers.'
+
+            # Calculate ANLS against each reference answer
+            values = []
+            for ans in answer:
+                # preprocess both answers - ground truth and prediction
+                gt_answer = ' '.join(ans.strip().lower().split())
+                det_answer = ' '.join(prediction.strip().lower().split())
+
+                dist = levenshtein_distance(gt_answer, det_answer)
+                length = max(len(ans.upper()), len(prediction.upper()))
+                values.append(0.0 if length == 0 else float(dist) / float(length))
+
+            question_result = 0.0
+            if values:
+                question_result = 1 - min(values)
+                if question_result < self.thresh_hold:
+                    question_result = 0.0
+            res.append(question_result)
+        return res
+
+
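To make the scoring concrete, one pass through the loop above (values computed by hand from the code shown):

    # reference '["Hawaii"]' parses to ['Hawaii']; prediction 'hawai'
    # gt_answer = 'hawaii', det_answer = 'hawai' -> levenshtein_distance = 1
    # length = max(6, 5) = 6 -> value = 1/6
    # question_result = 1 - 1/6 = 0.8333..., kept since it is >= thresh_hold
    ANLS().apply(['hawai'], ['["Hawaii"]'])   # -> [0.8333...]
    # any similarity below thresh_hold (default 0.5) is truncated to 0.0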
+@register_metric(name='bertscore')
+class BertScore(SingletonMetric):
+
+    def _init_once(self, model_id_or_path: str = 'google-bert/bert-base-chinese', **kwargs):
+        """BertScore metric.
+
+        Args:
+            model_id_or_path (str, optional): The model ID on ModelScope or path to the pre-trained model.
+                Defaults to 'google-bert/bert-base-chinese'.
+        """
+        check_import('torch', 'torch', raise_error=True, feature_name='BertScore Metric')
+
+        from .bert_score.scorer import BERTScorer
+        self.scorer = BERTScorer(model_id_or_path=model_id_or_path, batch_size=1024, **kwargs)
+
+    def apply(self, predictions: List[str], references: List[str]) -> List[float]:
+        _, _, F1 = self.scorer.score(predictions, references)
+        return [round(f1.item(), 6) for f1 in F1]
+
+
+@register_metric(name='comet')
+class COMETScore(SingletonMetric):
+
+    def _init_once(self, model_id_or_path: str = 'evalscope/wmt22-comet-da'):
+        """COMETScore metric.
+
+        Args:
+            model_id_or_path (str, optional): The model ID on ModelScope or path to the checkpoint.
+                Defaults to 'evalscope/wmt22-comet-da'.
+        """
+        check_import('comet', 'unbabel-comet', raise_error=True, feature_name='COMETScore Metric')
+
+        from comet import load_from_checkpoint
+        from modelscope import snapshot_download
+
+        self.model_name = model_id_or_path
+        model_path = snapshot_download(model_id_or_path)
+        checkpoint_path = os.path.join(model_path, 'checkpoints', 'model.ckpt')
+        self.comet_scorer = load_from_checkpoint(checkpoint_path)
+
+    def apply(self, samples: List[Dict[str, str]]) -> List[float]:
+        """Apply COMET scoring."""
+        import torch
+
+        model_output = self.comet_scorer.predict(
+            samples=samples,
+            batch_size=1024,
+            gpus=1 if torch.cuda.is_available() else 0,
+            progress_bar=False,
+        )
+        scores = model_output.scores if hasattr(model_output, 'scores') else [model_output.system_score] * len(samples)
+
+        return [round(score, 6) for score in scores]
+
+
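COMET models such as wmt22-comet-da score source/translation/reference triplets, so the samples passed to apply would look like the following (the src/mt/ref keys are the unbabel-comet convention; the sentences are illustrative):

    samples = [
        {
            'src': 'Der Hund bellt.',       # source sentence
            'mt': 'The dog barks.',         # system translation being scored
            'ref': 'The dog is barking.',   # human reference
        },
    ]
    segment_scores = COMETScore().apply(samples)   # one score per triplet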
+# ##################
+# T2I Metrics ######
+# ##################
+@register_metric(name='VQAScore')
+class VQAScore(T2IMetric):
+
+    def _init_once(self, model: str = 'clip-flant5-xxl'):
+        from .t2v_metrics.vqascore import VQAScore
+        self.model = VQAScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='PickScore')
+class PickScore(T2IMetric):
+
+    def _init_once(self, model: str = 'pickscore-v1'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='CLIPScore')
+class CLIPScore(T2IMetric):
+
+    def _init_once(self, model: str = 'openai:ViT-L-14-336'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='BLIPv2Score')
+class BLIPv2Score(T2IMetric):
+
+    def _init_once(self, model: str = 'blip2-itm'):
+        from .t2v_metrics.itmscore import ITMScore
+        self.model = ITMScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='HPSv2Score')
+class HPSv2Score(T2IMetric):
+
+    def _init_once(self, model: str = 'hpsv2'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='HPSv2.1Score')
+class HPSv2_1Score(T2IMetric):
+
+    def _init_once(self, model: str = 'hpsv2.1'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='ImageRewardScore')
+class ImageRewardScore(T2IMetric):
+
+    def _init_once(self, model: str = 'image-reward-v1'):
+        from .t2v_metrics.itmscore import ITMScore
+        self.model = ITMScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='FGA_BLIP2Score')
+class FGA_BLIP2Score(T2IMetric):
+
+    def _init_once(self, model: str = 'fga_blip2'):
+        from .t2v_metrics.itmscore import ITMScore
+        self.model = ITMScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+@register_metric(name='MPS')
+class MPS(T2IMetric):
+
+    def _init_once(self, model: str = 'mps'):
+        from .t2v_metrics.clipscore import CLIPScore
+        self.model = CLIPScore(model=model)
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+        return self.model(images, texts, **kwargs)
+
+
+# ##################
+# Aggregators ######
+# ##################
+@register_aggregation(name='mean')
+class Mean(Aggregator):
+
+    name = 'mean'
+
+    def agg_func(self, values: List[float]) -> float:
+        return mean(values)
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores by computing the mean for each metric.
+
+        Args:
+            scores: List of sample scores to aggregate
+
+        Returns:
+            List of aggregated scores with mean values
+        """
+        if not scores:
+            return []
+
+        # Group score values by metric name
+        metric_values = defaultdict(list)
+        metric_sample_ids = defaultdict(list)
+
+        for score in scores:
+            for metric_name, value in score.score.value.items():
+                metric_values[metric_name].append(value)
+                metric_sample_ids[metric_name].append(score.sample_id)
+
+        # Calculate mean for each metric
+        aggregated_scores = []
+        for metric_name, values in metric_values.items():
+            if values:  # Only process non-empty value lists
+                aggregated_scores.append(
+                    AggScore(
+                        score=self.agg_func(values),
+                        metric_name=metric_name,
+                        aggregation_name=self.name,
+                        num=len(values),
+                        ids=metric_sample_ids[metric_name]
+                    )
+                )
+
+        return aggregated_scores
+
+
+@register_aggregation(name='clipped_mean')
+class ClippedMean(Mean):
+
+    name = 'clipped_mean'
+
+    def __init__(self, clip_min: float = 0.0, clip_max: float = 1.0):
+        self.clip_min = clip_min
+        self.clip_max = clip_max
+
+    def agg_func(self, values: List[float]) -> float:
+        clipped_values = min(max(mean(values), self.clip_min), self.clip_max)
+        return clipped_values
+
+
+@register_aggregation(name='pass_at_k')
+class PassAtK(Aggregator):
+
+    def __init__(self, k: int = 1):
+        self.k = k
+        self.name = f'pass_at_{k}'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores by computing the pass@k for each metric using group_id.
+
+        Args:
+            scores: List of sample scores to aggregate
+
+        Returns:
+            List of aggregated scores with pass@k values
+        """
+        if not scores:
+            return []
+
+        # Group scores by metric name and group_id
+        metric_groups = defaultdict(lambda: defaultdict(list))
+
+        for score in scores:
+            group_id = getattr(score, 'group_id', score.sample_id)  # fallback to sample_id if no group_id
+
+            for metric_name, value in score.score.value.items():
+                metric_groups[metric_name][group_id].append(float(value))
+
+        # Calculate pass@k for each metric
+        aggregated_scores = []
+        for metric_name, groups in metric_groups.items():
+            if not groups:
+                continue
+
+            # Calculate pass@k for each group (problem)
+            num_samples = []
+            num_correct = []
+            all_sample_ids = []
+
+            for group_id, group_values in groups.items():
+                num_samples.append(len(group_values))
+                num_correct.append(sum(group_values))  # count how many passed in this group
+                all_sample_ids.extend([f'{group_id}_{i}' for i in range(len(group_values))])
+
+            if num_samples:
+                # Use the calculate_pass_at_k function from metrics
+                pass_at_k_values = calculate_pass_at_k(num_samples, num_correct, self.k)
+                overall_pass_at_k = float(np.mean(pass_at_k_values))
+
+                aggregated_scores.append(
+                    AggScore(
+                        score=overall_pass_at_k,
+                        metric_name=f'pass@{self.k}',
+                        aggregation_name='',
+                        num=len(scores),
+                        ids=all_sample_ids
+                    )
+                )
+
+        return aggregated_scores
+
+
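For reference, calculate_pass_at_k is presumably the unbiased estimator from the HumanEval paper, 1 - C(n-c, k) / C(n, k) per problem, which the aggregator above then averages with np.mean; a minimal numerically stable sketch (the real helper lives in .metrics):

    import numpy as np

    def pass_at_k(n: int, c: int, k: int) -> float:
        # Probability that at least one of k draws without replacement from
        # n attempts, c of which are correct, is correct.
        if n - c < k:
            return 1.0  # fewer than k failures exist, so a pass is guaranteed
        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))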
+@register_aggregation(name='mean_and_pass_at_k')
+class MeanPassAtK(Aggregator):
+
+    def __init__(self):
+        self.name = 'mean_and_pass_at_k'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Add per-metric pass@k (computed via calculate_pass_at_k) to each sample, then mean-aggregate.
+
+        For each metric:
+        - Group scores by group_id
+        - Collect binary correctness values
+        - Infer k as (total samples / number of groups) assuming uniform repetitions
+        - Compute per-group pass@k via calculate_pass_at_k
+        - Annotate each sample with metric_pass@k for its group
+        Finally run Mean() over the augmented metric set.
+        """
+        if not scores:
+            return []
+
+        # Extract metric names present in score values
+        metrics = list(scores[0].score.value.keys())
+
+        for metric_name in metrics:
+            # group_id -> list[float] (0/1 correctness values)
+            group_values: Dict[str, List[float]] = defaultdict(list)
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                value = float(s.score.value[metric_name])
+                group_values[group_id].append(value)
+
+            if not group_values:
+                continue
+
+            # Infer k (assumes roughly uniform repeats)
+            k = int(len(scores) / len(group_values)) if len(group_values) > 0 else 1
+            if k <= 0:
+                k = 1
+
+            # Prepare inputs for calculate_pass_at_k
+            num_samples: List[int] = []
+            num_correct: List[int] = []
+            group_order: List[str] = []
+            for gid, vals in group_values.items():
+                group_order.append(gid)
+                num_samples.append(len(vals))
+                num_correct.append(int(sum(vals)))
+
+            # Compute per-group pass@k
+            pass_at_k_list = calculate_pass_at_k(num_samples, num_correct, k)
+            # Map back: group_id -> pass@k value
+            pass_at_k_map = {gid: float(v) for gid, v in zip(group_order, pass_at_k_list)}
+
+            # Annotate each sample with its group's pass@k
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                s.score.value[f'{metric_name}_pass@{k}'] = pass_at_k_map[group_id]
+
+        # Delegate mean aggregation over original + injected pass@k metrics
+        m = Mean()
+        return m(scores)
+
+
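Concretely, with 10 problems sampled 4 times each under a metric named acc, k is inferred as 40 / 10 = 4, and every sample's value dict is augmented in place before the final averaging (numbers illustrative):

    # before: score.score.value == {'acc': 1.0}
    # after:  score.score.value == {'acc': 1.0, 'acc_pass@4': 0.75}   # its group's pass@4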
+@register_aggregation(name='mean_and_vote_at_k')
+class MeanVoteAtK(Aggregator):
+
+    def __init__(self):
+        self.name = 'mean_and_vote_at_k'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores by computing the vote@k for each metric using group_id.
+
+        Args:
+            scores: List of sample scores to aggregate
+
+        Returns:
+            List of aggregated scores with vote@k values
+        """
+        if not scores:
+            return []
+
+        metrics = list(scores[0].score.value.keys())
+
+        # Calculate vote@k for all metrics
+        for metric_name in metrics:
+
+            # Count of occurrences for each answer in each group_id
+            answer_groups = defaultdict(lambda: defaultdict(int))
+            # Score for each answer in each group_id
+            scores_groups = defaultdict(lambda: defaultdict(float))
+            # Score of the most frequently occurring answer
+            final_scores_groups = defaultdict(float)
+            # Count different answers for this metric
+            for score in scores:
+                group_id = getattr(score, 'group_id', score.sample_id)  # fallback to sample_id if no group_id
+                answer_prediction = getattr(score.score, 'extracted_prediction', None)
+                answer_groups[group_id][answer_prediction] += 1
+                scores_groups[group_id][answer_prediction] = score.score.value[metric_name]
+            # Calculate the repetition count k for each problem
+            k = int(len(scores) / len(answer_groups))
+
+            # Use the score of the most frequently occurring answer as the group's score
+            for group_id in answer_groups:
+                final_scores_groups[group_id] = scores_groups[group_id][
+                    max(answer_groups[group_id], key=answer_groups[group_id].get)]
+
+            # Add the corresponding vote@k for the metric to each score's value
+            for score in scores:
+                group_id = getattr(score, 'group_id', score.sample_id)
+                score.score.value.update({f'{metric_name}_vote@{k}': final_scores_groups[group_id]})
+
+        # Calculate the mean value for all metrics and their corresponding vote@k
+        m = Mean()
+        return m(scores)
+
+
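A small majority-vote walkthrough of the logic above (answers and values illustrative):

    # one group with k = 5 repeats; extracted predictions and their 'acc' values:
    #   '42' seen 3 times (acc 1.0), '41' seen 2 times (acc 0.0)
    # the most frequent answer is '42', so the group's acc_vote@5 becomes 1.0,
    # even though the plain mean of acc over the 5 repeats is only 0.6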
+@register_aggregation(name='mean_and_pass_hat_k')
+class MeanPassHatK(Aggregator):
+
+    def __init__(self):
+        self.name = 'mean_and_pass_hat_k'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Add per-metric pass^k using calculate_pass_hat_k, then mean-aggregate.
+
+        For each metric:
+        - Group scores by group_id
+        - Collect binary correctness values
+        - Infer k as approximate repeats and clamp to min attempts across groups
+        - Compute per-group pass^k via calculate_pass_hat_k
+        - Annotate each sample with metric_pass^{k} for its group
+        Finally run Mean() over the augmented metric set.
+        """
+        if not scores:
+            return []
+
+        # Freeze metric names before augmenting values to avoid iterating injected keys
+        metrics = list(scores[0].score.value.keys())
+
+        for metric_name in metrics:
+            # group_id -> list[float] (0/1 correctness values)
+            group_values: Dict[str, List[float]] = defaultdict(list)
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                value = float(s.score.value[metric_name])
+                group_values[group_id].append(value)
+
+            if not group_values:
+                continue
+
+            # Infer repeats and clamp to the smallest group size to satisfy k <= n
+            approx_k = int(len(scores) / len(group_values)) if len(group_values) > 0 else 1
+            min_n = min(len(vals) for vals in group_values.values())
+            k = max(1, min(approx_k, min_n))
+
+            # Compute per-group pass^k
+            pass_hat_k_map: Dict[str, float] = {}
+            for gid, vals in group_values.items():
+                n = len(vals)
+                c = int(sum(vals))
+                # calculate_pass_hat_k requires k <= n; ensured by clamping above
+                pass_hat_k_map[gid] = float(calculate_pass_hat_k(n, c, k))
+
+            # Annotate each sample with its group's pass^k
+            suffix = f'pass^{k}'
+            injected_key = f'{metric_name}_{suffix}'
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                s.score.value[injected_key] = pass_hat_k_map[group_id]
+
+        # Mean aggregate over original + injected pass^k metrics
+        m = Mean()
+        return m(scores)
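pass^k, the stricter "all k attempts succeed" statistic popularized by tau-bench, has a closed form C(c, k) / C(n, k); a minimal sketch of what calculate_pass_hat_k presumably computes per group:

    from math import comb

    def pass_hat_k(n: int, c: int, k: int) -> float:
        # Probability that k draws without replacement from n attempts,
        # c of which are correct, are ALL correct (requires k <= n).
        return comb(c, k) / comb(n, k)

    pass_hat_k(n=4, c=3, k=2)   # C(3,2) / C(4,2) = 3/6 = 0.5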