evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py
@@ -0,0 +1,433 @@
+ # flake8: noqa
+ import ast
+ import os
+ import re
+
+ from .IoUscore_metric import calculate_iou, extract_coordinates, vqa_with_position_evaluation
+ from .page_ocr_metric import cal_per_metrics
+ from .spotting_metric import extract_bounding_boxes_robust, spotting_evaluation
+ from .TEDS_metric import (
+     TEDS,
+     compute_f1_score,
+     convert_markdown_table_to_html,
+     convert_str_to_dict,
+     convert_str_to_multi_dict,
+     dict_to_html,
+     doc_parsing_evaluation,
+     generate_combinations,
+     wrap_html_table,
+ )
+ from .vqa_metric import (
+     cn_math_expression_evaluation,
+     cn_vqa_evaluation,
+     counting_evaluation,
+     math_expression_evaluation,
+     vqa_evaluation,
+     vqa_evaluation_case_sensitive,
+ )
+
+ teds = TEDS(n_jobs=os.cpu_count() or 1)
+
+
+ def is_nan_value(value):
+     if value is None:
+         return True
+     if isinstance(value, str) and value.lower() == 'nan':
+         return True
+     try:
+         import pandas as pd
+
+         if pd.isna(value):
+             return True
+     except:
+         pass
+     return False
+
+
+ def get_value_or_zero(value):
+     return 0.0 if value is None else value
+
+
+ def ocrbench_v2_process_results(doc, pred):
+     question = doc['question']
+     gt_ans = doc['answers']
+     data_type = doc['type']
+
+     score = 0
+
+     if (
+         data_type == 'APP agent en' or data_type == 'ASCII art classification en' or data_type == 'math QA en'
+         or data_type == 'reasoning VQA en' or data_type == 'science QA en' or data_type == 'text recognition en'
+         or data_type == 'document classification en' or data_type == 'cognition VQA en' or data_type == 'diagram QA en'
+     ):
+         if doc['eval'] == 'multiple choice':
+             if not isinstance(gt_ans, list):
+                 gt_ans = [gt_ans]
+             assert len(gt_ans) == 1
+
+             if not isinstance(pred, str):
+                 score = 0
+             else:
+                 predict = ''.join(c for c in pred if c.isalpha())
+
+                 if predict == gt_ans[0]:
+                     score = 1
+                 else:
+                     score = 0
+         elif doc['eval'] == 'case sensitive':
+             score = vqa_evaluation_case_sensitive(pred, gt_ans)
+
+         else:
+             score = vqa_evaluation(pred, gt_ans)
+
+     elif data_type == 'cognition VQA cn' or data_type == 'reasoning VQA cn':
+         if doc['eval'] == 'multiple choice':
+             assert len(gt_ans) == 1
+             predict = ''.join(c for c in pred if c.isalpha())
+
+             if predict == gt_ans[0]:
+                 score = 1
+             else:
+                 score = 0
+         elif doc['eval'] == 'case sensitive':
+             score = vqa_evaluation_case_sensitive(pred, gt_ans)
+
+         else:
+             score = cn_vqa_evaluation(pred, gt_ans)
+
+     elif data_type == 'handwritten answer extraction cn':
+         if '简答' in question:
+             ocr_metric = cal_per_metrics(pred, gt_ans[0])
+             score = (
+                 get_value_or_zero(ocr_metric['bleu']) + get_value_or_zero(ocr_metric['meteor'])
+                 + get_value_or_zero(ocr_metric['f_measure']) + (1 - get_value_or_zero(ocr_metric['edit_dist']))
+             ) / 4
+         else:
+             assert len(gt_ans) == 1
+             answer = gt_ans[0]
+             chars = list(answer)
+             if len(answer) > 1:
+                 answer_list = [
+                     ''.join(chars), '.'.join(chars), '. '.join(chars), ','.join(chars), ', '.join(chars),
+                     '、'.join(chars), ';'.join(chars), '; '.join(chars), ' '.join(chars), '和'.join(chars)
+                 ]
+                 max_score = 0
+                 for answer in answer_list:
+                     if answer in pred:
+                         temp_score = 1
+                     else:
+                         temp_score = 0
+                     if temp_score > max_score:
+                         max_score = temp_score
+                 score = max_score
+
+             else:
+                 if gt_ans[0] in pred:
+                     score = 1
+                 else:
+                     score = 0
+
+     elif data_type == 'formula recognition cn':
+         if is_nan_value(pred):
+             score = 0
+         else:
+             score = cn_math_expression_evaluation(pred, gt_ans)
+
+     elif data_type == 'text counting en':
+         score = counting_evaluation(pred, gt_ans, doc['eval'])
+
+     elif data_type == 'formula recognition en':
+         score = math_expression_evaluation(pred, gt_ans)
+
+     elif data_type == 'table parsing en':
+         if type(gt_ans) == list and len(gt_ans) == 1:
+             if not isinstance(pred, str):
+                 score = 0
+
+             elif 'html' in question.lower():
+                 no_find = False
+                 predict_table = pred.replace('\n', '')
+                 if '<body' in predict_table:
+                     predict_table = re.findall('<body.*', predict_table)[0]
+                 elif '<table' in predict_table:
+                     predict_table = re.findall('<table.*', predict_table)[0]
+                 else:
+                     no_find = True
+
+                 if no_find:
+                     score = 0
+                 else:
+                     pred_table_html = wrap_html_table(predict_table)
+                     gold_table_html = wrap_html_table(gt_ans[0])
+                     try:
+                         score = teds.evaluate(pred_table_html, gold_table_html)
+                     except:
+                         score = 0
+
+             elif 'markdown' in question.lower():
+                 if not isinstance(pred, str):
+                     prediction = str(pred)
+                     pred_table_html = convert_markdown_table_to_html(prediction)
+                     gt_table_html = convert_markdown_table_to_html(gt_ans[0])
+                     score = teds.evaluate(pred_table_html, gt_table_html)
+
+                 else:
+                     pred_table_html = convert_markdown_table_to_html(pred)
+                     gt_table_html = convert_markdown_table_to_html(gt_ans[0])
+                     score = teds.evaluate(pred_table_html, gt_table_html)
+         else:
+             raise ValueError
+
+     elif data_type == 'table parsing cn':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             no_find = False
+             predict_table = pred.replace('\n', '')
+             if '<body' in predict_table:
+                 predict_table = re.findall('<body.*', predict_table)[0]
+             elif '<table' in predict_table:
+                 predict_table = re.findall('<table.*', predict_table)[0]
+             else:
+                 no_find = True
+
+             if no_find:
+                 score = 0
+             else:
+                 pred_table_html = wrap_html_table(predict_table)
+                 gold_table_html = wrap_html_table(gt_ans[0])
+                 try:
+                     score = teds.evaluate(pred_table_html, gold_table_html)
+                 except:
+                     score = 0
+                     print('error')
+
+     elif data_type == 'chart parsing en':
+         answer = gt_ans[0]
+         if pred:
+             pred_chart_dict = convert_str_to_multi_dict(pred)
+             if len(pred_chart_dict) == 0:
+                 score = 0
+             else:
+                 pred_chart_html = dict_to_html(pred_chart_dict)
+                 if isinstance(answer, str):
+                     answer = convert_str_to_multi_dict(answer)  # parse the ground-truth string into a dict before rendering
+                 gt_chart_html = dict_to_html(answer)
+                 score = teds.evaluate(pred_chart_html, gt_chart_html)
+         else:
+             score = 0
+
+     elif data_type == 'document parsing en':
+         assert type(gt_ans) == list and len(gt_ans) == 1
+         score = doc_parsing_evaluation(pred, gt_ans[0])
+
+     elif data_type == 'document parsing cn':
+         assert type(gt_ans) == list and len(gt_ans) == 1
+         score = doc_parsing_evaluation(pred, gt_ans[0])
+
+     elif data_type == 'key information extraction en' or data_type == 'key information mapping en':
+         assert len(gt_ans) == 1
+         answers = generate_combinations(gt_ans[0])
+
+         if type(answers) == list and len(answers) == 1:
+             if not isinstance(pred, str):
+                 score = 0
+             else:
+                 pred_kie_dict = convert_str_to_dict(pred)
+                 score = compute_f1_score(pred_kie_dict, answers[0])
+         else:
+             max_score = 0
+             for answer in answers:
+                 pred_kie_dict = convert_str_to_dict(pred)
+                 score = compute_f1_score(pred_kie_dict, answer)
+
+                 if score > max_score:
+                     max_score = score
+             score = max_score
+
+     elif data_type == 'key information extraction cn':
+         assert len(gt_ans) == 1
+         answers = ast.literal_eval(gt_ans[0])
+         answers = {k: v if isinstance(v, list) else [v] for k, v in answers.items()}
+         answers = generate_combinations(answers)
+         if type(answers) == list and len(answers) == 1:
+             if not isinstance(pred, str):
+                 score = 0
+             else:
+                 pred_kie_dict = convert_str_to_dict(pred)
+                 score = compute_f1_score(pred_kie_dict, answers[0])
+         else:
+             max_score = 0
+             for answer in answers:
+                 pred_kie_dict = convert_str_to_dict(pred)
+                 score = compute_f1_score(pred_kie_dict, answer)
+
+                 if score > max_score:
+                     max_score = score
+             score = max_score
+
+     elif data_type == 'VQA with position en':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             pred_dict = convert_str_to_dict(pred)
+             score = vqa_with_position_evaluation(pred_dict, doc)
+
+     elif data_type == 'text translation cn':
+         if len(pred) == 0:
+             score = 0
+         else:
+             ocr_metric = cal_per_metrics(pred, gt_ans[0])
+             score = (
+                 ocr_metric['bleu'] + ocr_metric['meteor'] + ocr_metric['f_measure'] + (1 - ocr_metric['edit_dist'])
+             ) / 4
+
+     elif data_type == 'fine-grained text recognition en':
+         if not isinstance(pred, str):
+             score = 0
+         elif len(pred) == 0:
+             score = 0
+         else:
+             ocr_metric = cal_per_metrics(pred, gt_ans[0])
+             score = (
+                 get_value_or_zero(ocr_metric['bleu']) + get_value_or_zero(ocr_metric['meteor'])
+                 + get_value_or_zero(ocr_metric['f_measure']) + (1 - get_value_or_zero(ocr_metric['edit_dist']))
+             ) / 4
+     elif data_type == 'full-page OCR en':
+         if not pred:
+             score = 0
+         else:
+             ocr_metric = cal_per_metrics(pred, gt_ans[0])
+             score = (
+                 get_value_or_zero(ocr_metric['bleu']) + get_value_or_zero(ocr_metric['meteor'])
+                 + get_value_or_zero(ocr_metric['f_measure']) + (1 - get_value_or_zero(ocr_metric['edit_dist']))
+             ) / 4
+
+     elif data_type == 'full-page OCR cn':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             if len(pred) == 0:
+                 score = 0
+             else:
+                 ocr_metric = cal_per_metrics(pred, gt_ans[0])
+                 score = (
+                     ocr_metric['bleu'] + ocr_metric['meteor'] + ocr_metric['f_measure'] + (1 - ocr_metric['edit_dist'])
+                 ) / 4
+
+     elif data_type == 'text grounding en':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             predict_bbox = extract_coordinates(pred)
+             if not predict_bbox:
+                 score = 0
+             else:
+                 score = calculate_iou(predict_bbox, gt_ans)
+
+     elif data_type == 'text spotting en':
+         if not isinstance(pred, str):
+             score = 0
+         else:
+             predict_bbox = extract_bounding_boxes_robust(pred)
+             if not predict_bbox:
+                 score = 0
+             else:
+                 score = spotting_evaluation(predict_bbox, doc)
+
+     return score
+
+
+ def calculate_average_score(categories, OCRBench_v2_score):
+     return sum(
+         sum(OCRBench_v2_score[cat]) / len(OCRBench_v2_score[cat]) if len(OCRBench_v2_score[cat]) > 0 else 0
+         for cat in categories
+     ) / len(categories)
+
+
+ def ocrbench_v2_aggregate_accuracy(results):
+     question_type_scores = {}
+     OCRBench_v2_score = {
+         'text_recognition_en': [],
+         'text_detection_en': [],
+         'text_spotting_en': [],
+         'relationship_extraction_en': [],
+         'element_parsing_en': [],
+         'mathematical_calculation_en': [],
+         'visual_text_understanding_en': [],
+         'knowledge_reasoning_en': [],
+         'text_recognition_cn': [],
+         'relationship_extraction_cn': [],
+         'element_parsing_cn': [],
+         'visual_text_understanding_cn': [],
+         'knowledge_reasoning_cn': [],
+     }
+
+     for result in results:
+
+         question_type = result['question_type']
+         score = result['score']
+
+         if question_type not in question_type_scores:
+             question_type_scores[question_type] = []
+         question_type_scores[question_type].append(score)
+
+         if question_type in ['text recognition en', 'fine-grained text recognition en', 'full-page OCR en']:
+             OCRBench_v2_score['text_recognition_en'].append(score)
+
+         elif question_type in ['text grounding en', 'VQA with position en']:
+             OCRBench_v2_score['text_detection_en'].append(score)
+
+         elif question_type == 'text spotting en':
+             OCRBench_v2_score['text_spotting_en'].append(score)
+
+         elif question_type in ['key information extraction en', 'key information mapping en']:
+             OCRBench_v2_score['relationship_extraction_en'].append(score)
+
+         elif question_type in ['document parsing en', 'chart parsing en', 'table parsing en', 'formula recognition en']:
+             OCRBench_v2_score['element_parsing_en'].append(score)
+
+         elif question_type in ['math QA en', 'text counting en']:
+             OCRBench_v2_score['mathematical_calculation_en'].append(score)
+
+         elif question_type in ['document classification en', 'cognition VQA en', 'diagram QA en']:
+             OCRBench_v2_score['visual_text_understanding_en'].append(score)
+
+         elif question_type in ['reasoning VQA en', 'science QA en', 'APP agent en', 'ASCII art classification en']:
+             OCRBench_v2_score['knowledge_reasoning_en'].append(score)
+
+         elif question_type == 'full-page OCR cn':
+             OCRBench_v2_score['text_recognition_cn'].append(score)
+
+         elif question_type in ['key information extraction cn', 'handwritten answer extraction cn']:
+             OCRBench_v2_score['relationship_extraction_cn'].append(score)
+
+         elif question_type in ['document parsing cn', 'table parsing cn', 'formula recognition cn']:
+             OCRBench_v2_score['element_parsing_cn'].append(score)
+
+         elif question_type == 'cognition VQA cn':
+             OCRBench_v2_score['visual_text_understanding_cn'].append(score)
+
+         elif question_type in ['reasoning VQA cn', 'text translation cn']:
+             OCRBench_v2_score['knowledge_reasoning_cn'].append(score)
+
+         else:
+             print('No such task!')
+             raise TypeError
+
+     english_tasks = [
+         'text_recognition_en', 'text_detection_en', 'text_spotting_en', 'relationship_extraction_en',
+         'element_parsing_en', 'mathematical_calculation_en', 'visual_text_understanding_en', 'knowledge_reasoning_en'
+     ]
+
+     chinese_tasks = [
+         'text_recognition_cn', 'relationship_extraction_cn', 'element_parsing_cn', 'visual_text_understanding_cn',
+         'knowledge_reasoning_cn'
+     ]
+
+     OCRBench_v2_English_subset_score = calculate_average_score(english_tasks, OCRBench_v2_score)
+     OCRBench_v2_Chinese_subset_score = calculate_average_score(chinese_tasks, OCRBench_v2_score)
+
+     Final_score = (OCRBench_v2_English_subset_score + OCRBench_v2_Chinese_subset_score) / 2
+
+     return Final_score  # return the final score as accuracy
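
For orientation, the two entry points above compose as follows: ocrbench_v2_process_results(doc, pred) scores one sample according to its type and eval fields, while ocrbench_v2_aggregate_accuracy(results) buckets per-sample scores into thirteen task categories, averages each bucket, and then averages the English and Chinese subset means into the final score. A minimal sketch of that flow, assuming evalscope 1.2.0 and its OCRBench-v2 dependencies are installed; the sample doc dict here is hypothetical:

from evalscope.benchmarks.ocr_bench.ocr_bench_v2.utils import (
    ocrbench_v2_aggregate_accuracy,
    ocrbench_v2_process_results,
)

# Hypothetical multiple-choice sample: scored by exact letter match.
doc = {
    'question': 'Which option matches the sign? A. EXIT B. OPEN',
    'answers': ['A'],
    'type': 'cognition VQA en',
    'eval': 'multiple choice',
}
score = ocrbench_v2_process_results(doc, 'A')  # -> 1

# Aggregation expects one dict per sample carrying its fine-grained question type.
final = ocrbench_v2_aggregate_accuracy([
    {'question_type': 'cognition VQA en', 'score': score},
    {'question_type': 'full-page OCR cn', 'score': 0.5},
])

Categories with no samples contribute 0 to their subset mean, so a toy run like this is expected to yield a low final score by construction.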
evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py
@@ -0,0 +1,254 @@
+ # flake8: noqa
+ import math
+ import re
+
+
+ def levenshtein_distance(s1, s2):
+     if len(s1) > len(s2):
+         s1, s2 = s2, s1
+
+     distances = range(len(s1) + 1)
+     for i2, c2 in enumerate(s2):
+         distances_ = [i2 + 1]
+         for i1, c1 in enumerate(s1):
+             if c1 == c2:
+                 distances_.append(distances[i1])
+             else:
+                 distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
+         distances = distances_
+     return distances[-1]
+
+
+ def vqa_evaluation(predict, answers):
+     score = 0
+     if isinstance(answers, list):
+         predict_str = str(predict).lower().strip().replace('\n', ' ')
+         for ans in answers:
+             answer = str(ans).lower().strip().replace('\n', ' ')
+             if len(answer.split()) < 5:
+                 if answer in predict_str:
+                     score = 1
+             else:
+                 dist = levenshtein_distance(predict_str, answer)
+                 length = max(len(predict_str), len(answer))
+                 ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+                 ANLS_value = 1 - ANLS_value
+
+                 if ANLS_value >= 0.5 and ANLS_value > score:
+                     score = ANLS_value
+
+     else:
+         answer = str(answers).lower().strip().replace('\n', ' ')
+         predict_str = str(predict).lower().strip().replace('\n', ' ')
+         if len(answer.split()) < 5:
+             if answer in predict_str:
+                 score = 1
+         else:
+             dist = levenshtein_distance(predict_str, answer)
+             length = max(len(predict_str), len(answer))
+             ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+             ANLS_value = 1 - ANLS_value
+
+             if ANLS_value >= 0.5 and ANLS_value > score:
+                 score = ANLS_value
+
+     return score
+
+
+ def cn_vqa_evaluation(predict, answers):
+     score = 0
+     if isinstance(answers, list):
+         predict_str = str(predict).lower().strip().replace('\n', ' ').replace(' ', '')
+         for ans in answers:
+             answer = str(ans).lower().strip().replace('\n', ' ').replace(' ', '')
+             if len(answer.split(',')) < 4:
+                 if answer in predict_str:
+                     score = 1
+             else:
+                 dist = levenshtein_distance(predict_str, answer)
+                 length = max(len(predict_str), len(answer))
+                 ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+                 ANLS_value = 1 - ANLS_value
+
+                 if ANLS_value >= 0.5 and ANLS_value > score:
+                     score = ANLS_value
+
+     else:
+         answer = str(answers).lower().strip().replace('\n', ' ').replace(' ', '')
+         predict_str = str(predict).lower().strip().replace('\n', ' ').replace(' ', '')
+         if len(answer.split(',')) < 4:
+             if answer in predict_str:
+                 score = 1
+         else:
+             dist = levenshtein_distance(predict_str, answer)
+             length = max(len(predict_str), len(answer))
+             ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+             ANLS_value = 1 - ANLS_value
+
+             if ANLS_value >= 0.5 and ANLS_value > score:
+                 score = ANLS_value
+
+     return score
+
+
+ def vqa_evaluation_case_sensitive(predict, answers):
+     score = 0
+     if isinstance(answers, list):
+         predict_str = str(predict).strip().replace('\n', ' ')
+         for ans in answers:
+             answer = str(ans).strip().replace('\n', ' ')
+             if len(answer.split()) < 5:
+                 if answer in predict_str:
+                     score = 1
+             else:
+                 dist = levenshtein_distance(predict_str, answer)
+                 length = max(len(predict_str), len(answer))
+                 ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+                 ANLS_value = 1 - ANLS_value
+
+                 if ANLS_value >= 0.5 and ANLS_value > score:
+                     score = ANLS_value
+
+     else:
+         answer = str(answers).strip().replace('\n', ' ')
+         predict_str = str(predict).strip().replace('\n', ' ')
+         if len(answer.split()) < 5:
+             if answer in predict_str:
+                 score = 1
+         else:
+             dist = levenshtein_distance(predict_str, answer)
+             length = max(len(predict_str), len(answer))
+             ANLS_value = 0.0 if length == 0 else float(dist) / float(length)
+             ANLS_value = 1 - ANLS_value
+
+             if ANLS_value >= 0.5 and ANLS_value > score:
+                 score = ANLS_value
+
+     return score
+
+
+ def extract_first_number(string):
+     match = re.search(r'\d+', string)
+     if match:
+         return int(match.group())
+     return None
+
+
+ def counting_evaluation(predict, answers, eval_method):
+     score = 0
+
+     # normalize predict to string for both matching and number extraction
+     if isinstance(predict, str):
+         predict_str = predict.lower().strip().replace('\n', ' ')
+     elif isinstance(predict, (int, float)):
+         if isinstance(predict, float) and math.isnan(predict):
+             return 0
+         predict_str = str(predict).lower().strip().replace('\n', ' ')
+     else:
+         predict_str = str(predict).lower().strip().replace('\n', ' ')
+
+     if isinstance(answers, list):
+         temp_score = 0
+         for ans in answers:
+             answer = str(ans).lower().strip().replace('\n', ' ')
+             if eval_method == 'exact match':
+                 score = 1 if answer in predict_str else 0
+             elif eval_method == 'regression':
+                 predict_number = extract_first_number(predict_str)
+                 if predict_number is not None:
+                     try:
+                         answer_int = int(answer)
+                     except ValueError:
+                         score = 0
+                     else:
+                         if predict_number <= 0 or predict_number >= 2 * answer_int:
+                             score = 0
+                         else:
+                             iou = 1 - abs(predict_number - answer_int) / answer_int
+                             score = iou if iou > 0.5 else 0
+                 else:
+                     score = 0
+             if score > temp_score:
+                 temp_score = score
+         score = temp_score
+
+     else:
+         answer = str(answers).lower().strip().replace('\n', ' ')
+         if eval_method == 'exact match':
+             score = 1 if answer in predict_str else 0
+         elif eval_method == 'regression':
+             predict_number = extract_first_number(predict_str)
+             if predict_number is not None:
+                 try:
+                     answer_int = int(answer)
+                 except ValueError:
+                     score = 0
+                 else:
+                     if predict_number <= 0 or predict_number >= 2 * answer_int:
+                         score = 0
+                     else:
+                         iou = 1 - abs(predict_number - answer_int) / answer_int
+                         score = iou if iou > 0.5 else 0
+             else:
+                 score = 0
+     return score
+
+
+ def math_expression_evaluation(predict, answers):
+     score = 0
+     if type(answers) == list:
+         for j in range(len(answers)):
+             answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
+             predict = predict.strip().replace('\n', ' ').replace(' ', '')
+             if answer in predict:
+                 score = 1
+     else:
+         answers = answers.strip().replace('\n', ' ').replace(' ', '')
+         predict = predict.strip().replace('\n', ' ').replace(' ', '')
+         if answers in predict:
+             score = 1
+     return score
+
+
+ def remove_text_tags(latex_str):
+     """
+     Removes LaTeX \text{...} tags while keeping their content.
+
+     :param latex_str: A string containing LaTeX expressions
+     :return: The processed string with \text{...} tags removed
+     """
+
+     pattern = r'\\text\{([^{}]*)\}'
+
+     processed_str = re.sub(pattern, r'\1', latex_str)
+
+     return processed_str
+
+
+ def cn_math_expression_evaluation(predict, answers):
+     score = 0
+
+     assert len(answers) == 1
+     answers = [remove_text_tags(answers[0])]
+     predict = remove_text_tags(predict)
+
+     if type(answers) == list:
+         for j in range(len(answers)):
+             answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
+             predict = predict.strip().replace('\n', ' ').replace(' ', '')
+             if answer in predict:
+                 score = 1
+     else:
+         answers = answers.strip().replace('\n', ' ').replace(' ', '')
+         predict = predict.strip().replace('\n', ' ').replace(' ', '')
+         if answers in predict:
+             score = 1
+     return score
+
+
+ if __name__ == '__main__':
+     test_predict = 'apple pie and banana'
+     test_answers = ['apple', 'banana pie', 'apple pie and orange']
+
+     vqa_score = vqa_evaluation(test_predict, test_answers)
+     print(f"VQA evaluation score for predict '{test_predict}' and answers {test_answers}: {vqa_score}")