evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Note: this version of evalscope has been flagged as a potentially problematic release.
Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/mmmu/mmmu_adapter.py
@@ -0,0 +1,159 @@
+ import ast
+ import re
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'Accounting',
+     'Agriculture',
+     'Architecture_and_Engineering',
+     'Art',
+     'Art_Theory',
+     'Basic_Medical_Science',
+     'Biology',
+     'Chemistry',
+     'Clinical_Medicine',
+     'Computer_Science',
+     'Design',
+     'Diagnostics_and_Laboratory_Medicine',
+     'Economics',
+     'Electronics',
+     'Energy_and_Power',
+     'Finance',
+     'Geography',
+     'History',
+     'Literature',
+     'Manage',
+     'Marketing',
+     'Materials',
+     'Math',
+     'Mechanical_Engineering',
+     'Music',
+     'Pharmacy',
+     'Physics',
+     'Psychology',
+     'Public_Health',
+     'Sociology',
+ ]
+
+ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+ OPEN_PROMPT = """
+ Solve the following problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+ {question}
+
+ Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
+ """
+
+ MULTI_CHOICE_TYPE = 'multiple-choice'
+ OPEN_TYPE = 'open'
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mmmu',
+         pretty_name='MMMU',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'MMMU (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI) benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering. These questions span 30 subjects and 183 subfields, comprising 30 highly heterogeneous image types, such as charts, diagrams, maps, tables, music sheets, and chemical structures.', # noqa: E501
+         dataset_id='AI-ModelScope/MMMU',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='validation',
+         prompt_template=OPEN_PROMPT,
+     )
+ )
+ class MMMUAdapter(VisionLanguageAdapter):
+     MAX_IMAGES: int = 7
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         question_type = record['question_type']
+         content_list, answers_list = MMMUAdapter.create_content_and_answers_list(record)
+
+         metadata = {
+             'id': record['id'],
+             'question_type': record['question_type'],
+             'subfield': record['subfield'],
+             'explanation': record['explanation'],
+             'img_type': record['img_type'],
+             'topic_difficulty': record['topic_difficulty'],
+         }
+
+         if question_type == MULTI_CHOICE_TYPE:
+             return Sample(
+                 input=[ChatMessageUser(content=content_list)],
+                 choices=answers_list,
+                 target=record['answer'],
+                 metadata=metadata,
+             )
+         elif question_type == OPEN_TYPE:
+             return Sample(
+                 input=[ChatMessageUser(content=content_list)],
+                 target=record['answer'],
+                 metadata=metadata,
+             )
+         else:
+             raise ValueError(f'Unsupported question type: {question_type}')
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         question_type = task_state.metadata['question_type']
+         if question_type == MULTI_CHOICE_TYPE:
+             answers = parse_answers(task_state)
+             return ''.join(sorted(list(answers)))
+         elif question_type == OPEN_TYPE:
+             pattern = r'ANSWER:\s*(.*)'
+             match = re.search(pattern, prediction)
+             if match:
+                 return match.group(1).strip()
+             return ''
+         else:
+             raise ValueError(f'Unsupported question type: {question_type}')
+
+     @staticmethod
+     def create_content_and_answers_list(record: Dict[str, Any]) -> tuple[List[Content], List[str]]:
+         """
+         Create a list of content elements and a list of answers from a record.
+
+         Args:
+             record (dict): The record containing question, images, and options.
+
+
+         Returns:
+             tuple: A tuple containing:
+                 - content_list (list): A list of content elements (text and images).
+                 - answers_list (list): A list of possible answers (for multiple-choice questions).
+         """
+         question_type = record['question_type']
+
+         if question_type == MULTI_CHOICE_TYPE:
+             answers_list: List[str] = ast.literal_eval(record['options'])
+             input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+             content_list: List[Content] = [ContentText(text=input_text)]
+         else:
+             answers_list: List[str] = []
+             content_list: List[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+
+         for i in range(MMMUAdapter.MAX_IMAGES):
+             image = record[f'image_{i+1}']
+             if image:
+                 image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+                 content_list.append(ContentImage(image=image_base64))
+
+         return content_list, answers_list
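The new MMMU adapter builds its multimodal prompt per record through record_to_sample and the static create_content_and_answers_list helper above. The following is a minimal sketch, not part of the diff, of how a record flows through that helper; the field names come from the adapter, while the record values are invented for illustration.

# Sketch only: field names are taken from the adapter above; values are made up.
from evalscope.benchmarks.mmmu.mmmu_adapter import MMMUAdapter

record = {
    'question_type': 'multiple-choice',
    'question': 'Which structure is highlighted in <image 1>?',
    'options': "['Left atrium', 'Right atrium', 'Left ventricle', 'Right ventricle']",
    # image_1 .. image_7 are indexed directly by the helper, so every slot
    # must be present even when unused (None skips the base64 conversion).
    **{f'image_{i}': None for i in range(1, 8)},
}

content_list, answers_list = MMMUAdapter.create_content_and_answers_list(record)
# answers_list -> the option strings parsed from record['options']
# content_list -> [ContentText(...)] carrying the rendered multiple-choice prompt,
#                 plus ContentImage entries for any non-empty image_N slots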
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py
@@ -0,0 +1,129 @@
+ import ast
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate, answer_character, parse_answers, prompt
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'Accounting',
+     'Agriculture',
+     'Architecture_and_Engineering',
+     'Art',
+     'Art_Theory',
+     'Basic_Medical_Science',
+     'Biology',
+     'Chemistry',
+     'Clinical_Medicine',
+     'Computer_Science',
+     'Design',
+     'Diagnostics_and_Laboratory_Medicine',
+     'Economics',
+     'Electronics',
+     'Energy_and_Power',
+     'Finance',
+     'Geography',
+     'History',
+     'Literature',
+     'Manage',
+     'Marketing',
+     'Materials',
+     'Math',
+     'Mechanical_Engineering',
+     'Music',
+     'Pharmacy',
+     'Physics',
+     'Psychology',
+     'Public_Health',
+     'Sociology',
+ ]
+
+ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+ VISION_PROMPT = r"""
+ Answer the following multiple choice question in image. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
+
+ """.strip() # noqa: E501
+
+ DATASET_FORMATS = ['standard (4 options)', 'standard (10 options)', 'vision']
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mmmu_pro',
+         pretty_name='MMMU-PRO',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'MMMU-Pro is an enhanced multimodal benchmark designed to rigorously assess the true understanding capabilities of advanced AI models across multiple modalities. It builds upon the original MMMU benchmark by introducing several key improvements that make it more challenging and realistic, ensuring that models are evaluated on their genuine ability to integrate and comprehend both visual and textual information.', # noqa: E501
+         dataset_id='AI-ModelScope/MMMU_Pro',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template=MULT_CHOICE_PROMPT,
+         extra_params={
+             'dataset_format': f"# choose from {DATASET_FORMATS}, default 'standard (4 options)'",
+         }
+     )
+ )
+ class MMMUPROAdapter(VisionLanguageAdapter):
+     MAX_IMAGES: int = 7
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self.reformat_subset = True
+         self.dataset_format = self.extra_params.get('dataset_format', 'standard (4 options)')
+         if self.dataset_format not in DATASET_FORMATS:
+             logger.warning(f"Invalid dataset_format '{self.dataset_format}', fallback to 'standard (4 options)'")
+             self.dataset_format = 'standard (4 options)'
+         self.default_subset = self.dataset_format
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+         metadata = {
+             'id': record['id'],
+             'explanation': record.get('explanation'),
+             'img_type': record.get('img_type'),
+             'topic_difficulty': record.get('topic_difficulty'),
+             'subject': record.get('subject')
+         }
+
+         answers_list: List[str] = ast.literal_eval(record['options'])
+
+         if self.dataset_format == 'vision':
+             letters = ','.join(answer_character(i) for i in range(len(answers_list)))
+             input_text = VISION_PROMPT.format(letters=letters)
+             content_list: List[Content] = [ContentText(text=input_text)]
+
+             image = record.get('image')
+             if image:
+                 content_list.append(ContentImage(image=bytes_to_base64(image['bytes'], format='png', add_header=True)))
+         else:
+             input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+             content_list: List[Content] = [ContentText(text=input_text)]
+
+             for i in range(MMMUPROAdapter.MAX_IMAGES):
+                 image = record.get(f'image_{i+1}')
+                 if image:
+                     image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+                     content_list.append(ContentImage(image=image_base64))
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             choices=answers_list,
+             target=record['answer'],
+             subset_key=record['subject'],
+             metadata=metadata,
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         answers = parse_answers(task_state)
+         return ''.join(sorted(list(answers)))
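For the 'vision' dataset format, the adapter fills the {letters} placeholder in VISION_PROMPT from answer_character. Below is a tiny sketch, not part of the diff, of that letter string for a 10-option record; it assumes answer_character(i) maps 0 to 'A', 1 to 'B', and so on, which is implied by its use above but not shown in this diff.

# Sketch only; assumes answer_character(i) returns the i-th option letter.
from evalscope.utils.multi_choices import answer_character

letters = ','.join(answer_character(i) for i in range(10))
print(letters)  # expected: A,B,C,D,E,F,G,H,I,J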
evalscope/benchmarks/musr/musr_adapter.py
@@ -1,74 +1,43 @@
  import ast
  from typing import Any

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
-
-
- @Benchmark.register(
-     name='musr',
-     pretty_name='MuSR',
-     tags=['Reasoning', 'MCQ'],
-     description=
-     'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.', # noqa: E501
-     dataset_id='AI-ModelScope/MuSR',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     prompt_template=
-     '{narrative}\n\n{question}\n\n{choices}\nThink step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.', # noqa: E501
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='musr',
+         pretty_name='MuSR',
+         tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+         description=
+         'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.', # noqa: E501
+         dataset_id='AI-ModelScope/MuSR',
+         metric_list=['acc'],
+         subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+     )
  )
- class MuSRAdapter(DataAdapter):
+ class MuSRAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         self.choices = ['A', 'B', 'C', 'D', 'E', 'F']
-
-     def load(self, **kwargs):
-         # default load all levels
-         kwargs['split_as_subset'] = True
-         data_dict = super().load(**kwargs)
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-
-         choices = self.format_choice(ast.literal_eval(input_d['choices']))
-
-         full_prompt = self.prompt_template.format(
-             narrative=input_d['narrative'], question=input_d['question'], choices=choices)
-
-         return self.gen_prompt_data(full_prompt)
-
-     def format_choice(self, options: list):
-         option_str = ''
-         for opt, choice in zip(options, self.choices):
-             option_str += f'({choice}): {opt}\n'
-         return option_str
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
-         """
-         return self.choices[input_d['answer_index']]
+         self.split_as_subset = True

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the predicted result and extract proper answer.
-         """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return ResponseParser.parse_first_option(result, options=self.choices)
+     def record_to_sample(self, record) -> Sample:
+         choices = ast.literal_eval(record['choices'])
+         choice_letters = ['A', 'B', 'C', 'D', 'E', 'F']
+         target_letter = choice_letters[record['answer_index']]

-     def match(self, gold: str, pred: str) -> float:
-         """
-         Match the gold answer and the predicted answer.
-         """
-         return exact_match(gold=gold, pred=pred)
+         return Sample(
+             input=f"{record['narrative']}\n\n{record['question']}",
+             choices=choices,
+             target=target_letter,
+         )
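The rewritten MuSRAdapter collapses the old gen_prompt/parse/match pipeline into a single record_to_sample mapping. Below is a minimal sketch, not part of the diff, of the Sample it produces for one MuSR record; it mirrors the adapter code above, and the record values are invented for illustration.

# Sketch only: mirrors record_to_sample above; the record values are made up.
import ast
from evalscope.api.dataset import Sample

record = {
    'narrative': 'Three suspects were seen near the study on the night of the murder...',
    'question': 'Who is the most likely murderer?',
    'choices': "['Alice', 'Bob', 'Carol']",
    'answer_index': 1,
}

choice_letters = ['A', 'B', 'C', 'D', 'E', 'F']
sample = Sample(
    input=f"{record['narrative']}\n\n{record['question']}",
    choices=ast.literal_eval(record['choices']),
    target=choice_letters[record['answer_index']],  # 'B'
)
# Prompt rendering and answer matching are now handled by the MultiChoiceAdapter
# base class and MultipleChoiceTemplate.SINGLE_ANSWER_COT, rather than by
# per-adapter gen_prompt/parse_pred_result/match methods as in 0.17.x.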