evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/bfcl/v4/utils.py
@@ -0,0 +1,410 @@
+ from __future__ import annotations
+
+ import traceback
+ from collections import defaultdict
+ from copy import deepcopy
+ from pathlib import Path
+ from tqdm import tqdm
+ from typing import Any, Dict, List, Tuple
+
+ from evalscope.report import (
+     Category,
+     Report,
+     Subset,
+     percentage_weighted_average_from_subsets,
+     unweighted_average_from_subsets,
+     weighted_average_from_subsets,
+ )
+
+ # ----------------------------
+ # Public constants (extracted)
+ # ----------------------------
+
+ ALL_AVAILABLE_MEMORY_BACKENDS: List[str] = [
+     'kv',
+     'vector',
+     'rec_sum',
+ ]
+
+ NON_LIVE_CATEGORY: List[str] = [
+     'simple_python',
+     'simple_java',
+     'simple_javascript',
+     'multiple',
+     'parallel',
+     'parallel_multiple',
+     'irrelevance',
+ ]
+ LIVE_CATEGORY: List[str] = [
+     'live_simple',
+     'live_multiple',
+     'live_parallel',
+     'live_parallel_multiple',
+     'live_irrelevance',
+     'live_relevance',
+ ]
+ MULTI_TURN_CATEGORY: List[str] = [
+     'multi_turn_base',
+     'multi_turn_miss_func',
+     'multi_turn_miss_param',
+     'multi_turn_long_context',
+ ]
+ WEB_SEARCH_CATEGORY: List[str] = [
+     'web_search_base',
+     'web_search_no_snippet',
+ ]
+
+ MEMORY_CATEGORY: List[str] = [f'memory_{backend}' for backend in ALL_AVAILABLE_MEMORY_BACKENDS]
+ MEMORY_SCENARIO_NAME = [
+     'student',
+     'customer',
+     'finance',
+     'healthcare',
+     'notetaker',
+ ]
+
+ SINGLE_TURN_CATEGORY: List[str] = NON_LIVE_CATEGORY + LIVE_CATEGORY
+ AGENTIC_CATEGORY: List[str] = MEMORY_CATEGORY + WEB_SEARCH_CATEGORY
+
+ ALL_SCORING_CATEGORIES: List[str] = SINGLE_TURN_CATEGORY + MULTI_TURN_CATEGORY + AGENTIC_CATEGORY
+
+ # Dummy models used only to infer underscore_to_dot behavior
+ DUMMY_MODEL_UNDERSCORE_TO_DOT = 'gpt-4o-2024-11-20-FC'
+ DUMMY_MODEL_NO_UNDERSCORE_TO_DOT = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+ # ----------------------------
+ # Data preparation helpers
+ # ----------------------------
+
+
+ def load_bfcl_data(categories: List[str]) -> Tuple[Dict[str, List[Dict]], Dict[str, List[Dict]]]:
+     """
+     Load test entries and ground truth data from bfcl_eval for given categories.
+     """
+     from bfcl_eval.utils import is_relevance_or_irrelevance, load_dataset_entry, load_ground_truth_entry
+
+     test_entries_by_cat: Dict[str, List[Dict]] = defaultdict(list)
+     ground_truth_by_cat: Dict[str, List[Dict]] = defaultdict(list)
+
+     for category in categories:
+         test_entries_by_cat[category] = load_dataset_entry(
+             category, include_prereq=True, include_language_specific_hint=False
+         )
+         if not is_relevance_or_irrelevance(category):
+             ground_truth_by_cat[category] = load_ground_truth_entry(category)
+
+     return test_entries_by_cat, ground_truth_by_cat
+
+
+ def prepare_ground_truth_map(category: str, ground_truth_entries: List[Dict]) -> Dict[str, Dict]:
+     """
+     Map ground truth entries to IDs with category-specific adjustments.
+     """
+     from bfcl_eval.utils import is_memory, is_web_search
+
+     if not ground_truth_entries:
+         return {}
+
+     if is_memory(category):
+         return {entry['id'].replace('memory', category): entry for entry in ground_truth_entries}
+     if is_web_search(category):
+         return {entry['id'].replace('web_search', category): entry for entry in ground_truth_entries}
+     return {entry['id']: entry for entry in ground_truth_entries}
+
+
+ def process_test_entries(
+     category: str,
+     test_entries: List[Dict[str, Any]],
+     ground_truth_entries: List[Dict[str, Any]],
+     model_result_dir: Path,
+ ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+     """
+     Clean and enrich test entries, return processed entries and prereq entries.
+
+     Returns:
+         processed_entries: entries ready to be mapped to Samples
+         prereq_entries: entries requiring prereq inference (memory snapshots)
+     """
+     from bfcl_eval.utils import (
+         clean_up_memory_prereq_entries,
+         is_memory_prereq,
+         populate_initial_settings_for_memory_test_cases,
+         populate_initial_settings_for_web_search_test_cases,
+     )
+
+     ground_truth_map = prepare_ground_truth_map(category, ground_truth_entries)
+
+     test_entries = clean_up_memory_prereq_entries(test_entries)
+     test_entries = populate_initial_settings_for_web_search_test_cases(test_entries)
+     test_entries = populate_initial_settings_for_memory_test_cases(test_entries, model_result_dir=model_result_dir)
+
+     prereq_entries = [entry for entry in test_entries if is_memory_prereq(entry['id'])]
+     main_entries = [entry for entry in test_entries if not is_memory_prereq(entry['id'])]
+
+     processed_entries: List[Dict[str, Any]] = []
+     for entry in main_entries:
+         entry_id = entry['id']
+         entry['category'] = category
+         entry['ground_truth'] = ground_truth_map.get(entry_id, {}).get('ground_truth', {})
+         processed_entries.append(entry)
+
+     return processed_entries, prereq_entries
+
+
+ def run_prereq_inference(
+     handler: Any,
+     prereq_entries: List[Dict[str, Any]],
+     model_result_dir: Path,
+     batch_size: int,
+     logger: Any,
+ ) -> None:
+     """
+     Run prerequisite inferences for memory snapshot creation if results are missing.
+     Optimized to run different (backend, scenario) groups in parallel while preserving in-group order.
+     """
+     import re
+     from bfcl_eval.utils import get_directory_structure_by_id
+     from concurrent.futures import ThreadPoolExecutor, as_completed
+
+     if not prereq_entries:
+         return
+
+     def _parse_backend_scenario_idx(entry_id: str) -> Tuple[str, str, int]:
+         """
+         Extract backend, scenario, and scenario index from an entry id.
+         Expected format:
+             memory_{backend}_prereq_{total_index}-{scenario}-{scenario_index}
+         Returns ('unknown', 'unknown', 0) on failure.
+         """
+         backend = 'unknown'
+         scenario = 'unknown'
+         idx = 0
+
+         m_backend = re.search(r'^memory_(?P<backend>.+?)_prereq_', entry_id)
+         if m_backend:
+             backend = m_backend.group('backend')
+
+         m_tail = re.search(r'-(?P<scenario>[a-zA-Z_]+)-(?P<idx>\d+)$', entry_id)
+         if m_tail:
+             scenario = m_tail.group('scenario')
+             idx = int(m_tail.group('idx'))
+
+         return backend, scenario, idx
+
+     # Group entries by (backend, scenario)
+     groups: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
+     for entry in prereq_entries:
+         backend, scenario, idx = _parse_backend_scenario_idx(entry['id'])
+         entry['_group_backend'] = backend
+         entry['_group_scenario'] = scenario
+         entry['_scenario_idx'] = idx
+         groups.setdefault((backend, scenario), []).append(entry)
+
+     # Sort entries within each group by scenario index to keep order
+     for group_entries in groups.values():
+         group_entries.sort(key=lambda e: e.get('_scenario_idx', 0))
+
+     # Worker to process a single (backend, scenario) group sequentially
+     def _process_group_entries(group_entries: List[Dict[str, Any]], progress: Any) -> None:
+         for entry in group_entries:
+             try:
+                 memory_snapshot_folder = (
+                     model_result_dir / get_directory_structure_by_id(entry['id']) / 'memory_snapshot'
+                     / 'prereq_checkpoints'
+                 )
+                 existing_filenames = {f.name for f in memory_snapshot_folder.rglob('*.json')}
+                 if (entry['id'] + '.json') in existing_filenames:
+                     logger.info(f'Skipping prereq inference for entry ID {entry["id"]} as result already exists.')
+                 else:
+                     handler.inference(deepcopy(entry), include_input_log=False, exclude_state_log=False)
+             except Exception as e:
+                 logger.error(f'Error during prereq inference for entry ID {entry.get("id")}: {e}')
+                 logger.error(traceback.format_exc())
+             finally:
+                 # tqdm is thread-safe; each worker updates shared progress bar
+                 progress.update(1)
+
+     # Run each (backend, scenario) group in parallel; preserve in-group order
+     total = len(prereq_entries)
+     with tqdm(total=total, desc='Running prereq inferences for memory snapshots...') as progress:
+         max_workers = min(batch_size, len(groups))
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             futures = [
+                 executor.submit(_process_group_entries, group_entries, progress) for group_entries in groups.values()
+             ]
+             for _ in as_completed(futures):
+                 # Errors are logged within workers
+                 pass
+
+     # Cleanup temp keys
+     for group_entries in groups.values():
+         for entry in group_entries:
+             entry.pop('_group_backend', None)
+             entry.pop('_group_scenario', None)
+             entry.pop('_scenario_idx', None)
+
+
+ # ----------------------------
+ # Scoring helpers
+ # ----------------------------
+
+
+ def compute_entry_result(
+     handler: Any,
+     model_result: Any,
+     prompt_entry: Dict[str, Any],
+     underscore_to_dot: bool,
+ ) -> Dict[str, Any]:
+     """
+     Compute evaluation result for a single entry across BFCL categories.
+     """
+     from bfcl_eval.constants.enums import Language, ReturnFormat
+     from bfcl_eval.eval_checker.eval_runner import (
+         _evaluate_single_agentic_entry,
+         _evaluate_single_ast_entry,
+         _evaluate_single_multi_turn_entry,
+         _evaluate_single_relevance_entry,
+     )
+     from bfcl_eval.utils import is_agentic, is_java, is_js, is_multi_turn, is_relevance_or_irrelevance
+
+     test_category = prompt_entry['category']
+     index = prompt_entry['id']
+     ground_truth = prompt_entry.get('ground_truth', {})
+
+     model_name = (DUMMY_MODEL_UNDERSCORE_TO_DOT if underscore_to_dot else DUMMY_MODEL_NO_UNDERSCORE_TO_DOT)
+
+     if is_relevance_or_irrelevance(test_category):
+         return _evaluate_single_relevance_entry(
+             handler=handler,
+             index=index,
+             model_result_item=model_result,
+             prompt_entry=prompt_entry,
+             model_name=model_name,
+             test_category=test_category,
+         )
+
+     elif is_multi_turn(test_category):
+         return _evaluate_single_multi_turn_entry(
+             handler=handler,
+             test_entry_id=index,
+             model_result_list=model_result,
+             ground_truth_list=ground_truth,
+             prompt_entry=prompt_entry,
+             model_name=model_name,
+             test_category=test_category,
+         )
+
+     elif is_agentic(test_category):
+         return _evaluate_single_agentic_entry(
+             handler=handler,
+             index=index,
+             model_result_list=model_result,
+             possible_answer_item=ground_truth,
+             prompt_entry=prompt_entry,
+             model_name=model_name,
+             test_category=test_category,
+         )
+     else:
+         # AST categories (python/java/js)
+         if is_java(test_category):
+             language = Language.JAVA
+             return_format = ReturnFormat.JAVA
+         elif is_js(test_category):
+             language = Language.JAVASCRIPT
+             return_format = ReturnFormat.JAVASCRIPT
+         else:
+             language = Language.PYTHON
+             return_format = ReturnFormat.PYTHON
+
+         return _evaluate_single_ast_entry(
+             handler=handler,
+             index=index,
+             model_result_item=model_result,
+             possible_answer_item=ground_truth,
+             prompt_entry=prompt_entry,
+             model_name=model_name,
+             test_category=test_category,
+             language=language,
+             return_format=return_format,
+         )
+
+
+ # ----------------------------
+ # Report aggregation helpers
+ # ----------------------------
+
+
+ def compute_aggregate_subsets(report: Report) -> None:
+     """
+     Compute aggregated subsets and overall score for BFCL report.
+     Modifies the report in-place.
+     """
+     for metric in report.metrics:
+         # Collect all subsets in a dictionary for easy access
+         subset_dict: Dict[str, Subset] = {}
+         for category in metric.categories:
+             for subset in category.subsets:
+                 subset_dict[subset.name] = subset
+
+         # Step 1: simple_ast
+         simple_subsets = ['simple_python', 'simple_java', 'simple_javascript']
+         simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+         subset_dict['simple_ast'] = simple_ast
+
+         # Step 2.1: non_live (simple_ast, multiple, parallel, parallel_multiple)
+         non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+         non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+         subset_dict['non_live'] = non_live
+
+         # Step 2.2: live (weighted)
+         live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+         live = weighted_average_from_subsets(live_subsets, subset_dict)
+         subset_dict['live'] = live
+
+         # Step 2.3: hallucination (unweighted)
+         hallucination_subsets = ['live_irrelevance', 'irrelevance']
+         hallucination = unweighted_average_from_subsets(hallucination_subsets, subset_dict)
+         subset_dict['hallucination'] = hallucination
+
+         # Step 2.4: multi_turn (unweighted)
+         multi_turn_subsets = [
+             'multi_turn_base',
+             'multi_turn_miss_func',
+             'multi_turn_miss_param',
+             'multi_turn_long_context',
+         ]
+         multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+         subset_dict['multi_turn'] = multi_turn
+
+         # Step 2.5: web_search (unweighted)
+         web_search_subsets = ['web_search_base', 'web_search_no_snippet']
+         web_search = unweighted_average_from_subsets(web_search_subsets, subset_dict)
+         subset_dict['web_search'] = web_search
+
+         # Step 2.6: memory (unweighted)
+         memory_subsets = ['memory_kv', 'memory_vector', 'memory_rec_sum']
+         memory = unweighted_average_from_subsets(memory_subsets, subset_dict)
+         subset_dict['memory'] = memory
+
+         # Step 2.7: agentic (unweighted)
+         agentic_subsets = ['web_search', 'memory']
+         agentic = unweighted_average_from_subsets(agentic_subsets, subset_dict)
+         subset_dict['agentic'] = agentic
+
+         # Step 4: overall (percentage weighted average)
+         overall_subsets = ['agentic', 'multi_turn', 'non_live', 'live', 'hallucination']
+         overall = percentage_weighted_average_from_subsets(overall_subsets, subset_dict, weights=[40, 30, 10, 10, 10])
+         subset_dict['overall'] = overall
+
+         # Add computed scores to the category
+         computed_subset_names = ['agentic', 'multi_turn', 'non_live', 'live', 'hallucination', 'overall']
+
+         # Add the computed scores as new subsets in the metric
+         dummy_subsets: List[Subset] = []
+         for subset_name in computed_subset_names:
+             if subset_name in subset_dict and subset_dict[subset_name].num > 0:
+                 subset = subset_dict[subset_name]
+                 subset.name = subset_name.upper()
+                 dummy_subsets.append(subset)
+         dummy_category = Category(name='-', subsets=dummy_subsets)
+         metric.categories.append(dummy_category)
evalscope/benchmarks/biomix_qa/__init__.py: File without changes
evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py
@@ -0,0 +1,36 @@
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+ DESCRIPTION = (
+     'BiomixQA is a curated biomedical question-answering dataset. '
+     'BiomixQA has been utilized to validate the Knowledge Graph based '
+     'Retrieval-Augmented Generation (KG-RAG) framework across different LLMs.'
+ ) # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='biomix_qa',
+         pretty_name='BioMixQA',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.MEDICAL],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/biomix-qa',
+         metric_list=['acc'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+     )
+ )
+ class BioMixQAAdapter(MultiChoiceAdapter):
+
+     def record_to_sample(self, record) -> Sample:
+         return Sample(
+             input=record['question'],
+             choices=record['choices'],
+             target=record['answer'],
+             metadata={},
+         )
evalscope/benchmarks/blink/__init__.py: File without changes
evalscope/benchmarks/blink/blink_adapter.py
@@ -0,0 +1,61 @@
+ import re
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import format_letter_choices
+
+ logger = get_logger()
+
+ MULT_CHOICE_PROMPT = r"""
+ Answer the following multiple choice question. The last line of your response should be of the following format:
+ 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+ {question}
+ """.strip()
+
+ SUBSET_LIST = [
+     'Art_Style', 'Counting', 'Forensic_Detection', 'Functional_Correspondence', 'IQ_Test', 'Jigsaw',
+     'Multi-view_Reasoning', 'Object_Localization', 'Relative_Depth', 'Relative_Reflectance', 'Semantic_Correspondence',
+     'Spatial_Relation', 'Visual_Correspondence', 'Visual_Similarity'
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='blink',
+         pretty_name='BLINK',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+         description=
+         'BLINK is a benchmark designed to evaluate the core visual perception abilities of multimodal large language models (MLLMs). It transforms 14 classic computer vision tasks into 3,807 multiple-choice questions, accompanied by single or multiple images and visual prompts.', # noqa: E501
+         dataset_id='evalscope/BLINK',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='val',
+         prompt_template=MULT_CHOICE_PROMPT,
+     )
+ )
+ class BLINKAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+     MAX_IMAGES: int = 4
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         choices = record.get('choices')
+         input_text = MULT_CHOICE_PROMPT.format(question=record['prompt'], letters=format_letter_choices(choices))
+         content_list: List[Content] = [ContentText(text=input_text)]
+
+         for i in range(1, self.MAX_IMAGES + 1):
+             image = record.get(f'image_{i}')
+             if image:
+                 image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+                 content_list.append(ContentImage(image=image_base64))
+
+         label_answer = record['answer'].strip('(').strip(')')
+         return Sample(input=[ChatMessageUser(content=content_list)], choices=choices, target=label_answer)
evalscope/benchmarks/ceval/ceval_adapter.py
@@ -1,10 +1,9 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from functools import partial
  from typing import Any, Dict

  from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
- from evalscope.api.dataset import Dataset, RemoteDataLoader, Sample
+ from evalscope.api.dataset import Sample
  from evalscope.api.registry import register_benchmark
  from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger
evalscope/benchmarks/chartqa/__init__.py: File without changes
evalscope/benchmarks/chartqa/chartqa_adapter.py
@@ -0,0 +1,80 @@
+ import re
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.metric.scorer import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+ OPEN_PROMPT = """
+ {question}
+
+ The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the a single word answer to the problem.
+ """
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='chartqa',
+         pretty_name='ChartQA',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'ChartQA is a benchmark designed to evaluate question-answering capabilities about charts (e.g., bar charts, line graphs, pie charts), focusing on both visual and logical reasoning.', # noqa: E501
+         dataset_id='lmms-lab/ChartQA',
+         subset_list=['human_test', 'augmented_test'],
+         metric_list=['relaxed_acc'],
+         eval_split='test',
+         prompt_template=OPEN_PROMPT,
+     )
+ )
+ class ChartQAAdapter(VisionLanguageAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self.add_aggregation_name = False
+         self.reformat_subset = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         question = record['question']
+         image_data = record['image']
+         image_base64 = bytes_to_base64(image_data['bytes'], format='png', add_header=True)
+
+         content_list: List[Content] = [
+             ContentText(text=OPEN_PROMPT.format(question=question)),
+             ContentImage(image=image_base64)
+         ]
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=record['answer'],
+             subset_key=record['type'], # 'human_test' or 'augmented_split'
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         pattern = r'ANSWER:\s*(.*)'
+         match = re.search(pattern, prediction)
+         if match:
+             return match.group(1).strip()
+         return ''
+
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         from .utils import relaxed_correctness
+
+         score = relaxed_correctness(filtered_prediction, reference)
+         score = 1.0 if score else 0.0
+
+         return Score(
+             value={'relaxed_acc': score},
+             prediction=original_prediction,
+             extracted_prediction=filtered_prediction,
+         )
evalscope/benchmarks/chartqa/utils.py
@@ -0,0 +1,38 @@
+ def relaxed_correctness(prediction: str, target: str, max_relative_change: float = 0.05) -> bool:
+     """Calculates relaxed correctness.
+
+     The correctness tolerates certain error ratio defined by max_relative_change.
+     See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
+     “Following Methani et al. (2020), we use a relaxed accuracy measure for the
+     numeric answers to allow a minor inaccuracy that may result from the automatic
+     data extraction process. We consider an answer to be correct if it is within
+     5% of the gold answer. For non-numeric answers, we still need an exact match
+     to consider an answer to be correct.”
+
+     This funcion is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
+     Args:
+         target: List of target string.
+         prediction: List of predicted string.
+         max_relative_change: Maximum relative change.
+
+     Returns:
+         Whether the prediction was correct given the specified tolerance.
+     """ # noqa: E501
+
+     def _to_float(text: str):
+         try:
+             if text.endswith('%'):
+                 # Convert percentages to floats.
+                 return float(text.rstrip('%')) / 100.0
+             else:
+                 return float(text)
+         except ValueError:
+             return None
+
+     prediction_float = _to_float(prediction)
+     target_float = _to_float(target)
+     if prediction_float is not None and target_float:
+         relative_change = abs(prediction_float - target_float) / abs(target_float)
+         return relative_change <= max_relative_change
+     else:
+         return prediction.lower() == target.lower()
evalscope/benchmarks/coin_flip/__init__.py: File without changes