evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/mmlu/mmlu_adapter.py
@@ -1,80 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import csv
-import os
 
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy, exact_match
-from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser, normalize_score
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-
-# flake8: noqa
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/mmlu'
-
-SUBSET_LIST = [
-    'high_school_european_history',
-    'business_ethics',
-    'clinical_knowledge',
-    'medical_genetics',
-    'high_school_us_history',
-    'high_school_physics',
-    'high_school_world_history',
-    'virology',
-    'high_school_microeconomics',
-    'econometrics',
-    'college_computer_science',
-    'high_school_biology',
-    'abstract_algebra',
-    'professional_accounting',
-    'philosophy',
-    'professional_medicine',
-    'nutrition',
-    'global_facts',
-    'machine_learning',
-    'security_studies',
-    'public_relations',
-    'professional_psychology',
-    'prehistory',
-    'anatomy',
-    'human_sexuality',
-    'college_medicine',
-    'high_school_government_and_politics',
-    'college_chemistry',
-    'logical_fallacies',
-    'high_school_geography',
-    'elementary_mathematics',
-    'human_aging',
-    'college_mathematics',
-    'high_school_psychology',
-    'formal_logic',
-    'high_school_statistics',
-    'international_law',
-    'high_school_mathematics',
-    'high_school_computer_science',
-    'conceptual_physics',
-    'miscellaneous',
-    'high_school_chemistry',
-    'marketing',
-    'professional_law',
-    'management',
-    'college_physics',
-    'jurisprudence',
-    'world_religions',
-    'sociology',
-    'us_foreign_policy',
-    'high_school_macroeconomics',
-    'computer_security',
-    'moral_scenarios',
-    'moral_disputes',
-    'electrical_engineering',
-    'astronomy',
-    'college_biology',
-]
-
 SUBJECT_MAPPING = {
     'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
     'anatomy': ['Anatomy', 'health', 'Other'],
@@ -136,150 +70,38 @@ SUBJECT_MAPPING = {
 }
 
 
-@Benchmark.register(
-    name='mmlu',
-    dataset_id='modelscope/mmlu',
-    model_adapter=MultiChoiceModelAdapter,
-    subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
-    few_shot_num=5,
-    train_split='train',
-    eval_split='test',
-    prompt_template='',
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmlu',
+        pretty_name='MMLU',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        "The MMLU (Massive Multitask Language Understanding) benchmark is a comprehensive evaluation suite designed to assess the performance of language models across a wide range of subjects and tasks. It includes multiple-choice questions from various domains, such as history, science, mathematics, and more, providing a robust measure of a model's understanding and reasoning capabilities.",  # noqa: E501
+        dataset_id='cais/mmlu',
+        metric_list=['acc'],
+        subset_list=list(SUBJECT_MAPPING.keys()),
+        default_subset='all',
+        few_shot_num=5,
+        train_split='dev',
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class MMLUAdapter(DataAdapter):
-
-    choices = ['A', 'B', 'C', 'D']
+class MMLUAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
 
-        few_shot_num = kwargs.get('few_shot_num', 5)
-        if few_shot_num > 5:
-            logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
-            kwargs['few_shot_num'] = 5
-
        super().__init__(**kwargs)
 
+        self.reformat_subset = True
         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
 
-    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict = {}
-        for subset_name in subset_list:
-            data_dict[subset_name] = {}
-
-            for split_name in [self.train_split, self.eval_split]:
-                if split_name == 'train':
-                    split_name_suffix = 'dev'
-                elif split_name == 'test':
-                    split_name_suffix = 'test'
-                elif split_name == 'validation':
-                    split_name_suffix = 'val'
-                else:
-                    raise ValueError(f'Invalid split name: {split_name}')
-
-                if os.path.exists(dataset_name_or_path):
-                    file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
-                else:
-                    file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
-
-                if os.path.exists(file_path):
-                    with open(file_path, encoding='utf-8') as f:
-                        rows = []
-                        reader = csv.reader(f)
-                        for row in reader:
-                            if len(row) != 6:
-                                logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
-                                continue
-                            rows.append({
-                                'input': row[0],
-                                'A': row[1],
-                                'B': row[2],
-                                'C': row[3],
-                                'D': row[4],
-                                'target': row[5],
-                            })
-
-                        data_dict[subset_name].update({split_name: rows})
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw input, unify the prompt format for MMLU benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the MMLU:
-
-            {'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.',
-            'A': 'Service quality.',
-            'B': 'Service action.',
-            'C': 'Service recovery.',
-            'D': 'Service satisfaction.',
-            'target': 'A'}
-
-        Returns:
-            {'data': [full_prompt], 'multi_choices': self.choices}
-
-        """
-        prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
-            self._format_subject(subset_name))
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
-        context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context
-
-        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
-
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('target', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or 'custom'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if eval_type == EvalType.CHECKPOINT:
-            return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    @classmethod
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
-
-        input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
-
-        example: str = input_d['input']
-        for j in range(len(cls.choices)):
-            example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
-
-        example += '\nAnswer:'
-        if include_answer:
-            example += ' {}\n\n'.format(input_d['target'])
-
-        return example
-
-    @classmethod
-    def _format_subject(cls, subject):
-        l = subject.split('_')
-        s = ''
-        for entry in l:
-            s += ' ' + entry
-        return s
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            # converts 0 -> A, 1 -> B, etc.
+            target=('ABCD'[record['answer']]),
+            subset_key=record['subject'],
+            metadata={'subject': record['subject']},
+        )
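
For reference, the record_to_sample method added above converts one raw MMLU record into the framework's Sample type, mapping the integer answer index to a letter. A minimal standalone sketch of that conversion (assuming evalscope 1.2.0 is installed; the example record below is hypothetical, with field names taken from the diff):

from evalscope.api.dataset import Sample

# Hypothetical raw record in the cais/mmlu layout consumed by the new adapter.
record = {
    'question': 'Which planet is known as the Red Planet?',
    'choices': ['Venus', 'Mars', 'Jupiter', 'Mercury'],
    'answer': 1,  # integer index into `choices`
    'subject': 'astronomy',
}

sample = Sample(
    input=record['question'],
    choices=record['choices'],
    target='ABCD'[record['answer']],  # 1 -> 'B'
    subset_key=record['subject'],
    metadata={'subject': record['subject']},
)
print(sample.target)  # expected: 'B'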
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
@@ -1,110 +1,94 @@
-from collections import defaultdict
 from typing import Any, Dict
 
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import AverageAccuracy, exact_match
-from evalscope.models import ChatGenerationModelAdapter
-from evalscope.utils.utils import ResponseParser
-
-
-@Benchmark.register(
-    name='mmlu_pro',
-    dataset_id='modelscope/mmlu-pro',
-    model_adapter=ChatGenerationModelAdapter,
-    subset_list=['default'],
-    metric_list=[AverageAccuracy],
-    few_shot_num=5,
-    train_split='validation',
-    eval_split='test',
-    prompt_template=
-    'You are an knowledge expert, you are supposed to answer the multi-choice question to derive your final answer as `The answer is ...`.',  # noqa: E501
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# Based on the prompt provided here:
+# https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/mmlu_pro
+SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE = """
+The following are multiple choice questions (with answers) about {subject}. Think step by step and then finish your answer with 'ANSWER: $LETTER' (without quotes) where LETTER is the correct letter choice.
+
+{examples}
+""".lstrip()  # noqa: E501
+
+# Based on MultipleChoiceTemplate.SINGLE_ANSWER provided in the multiple choice solver:
+# https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/solver/_multiple_choice.py
+USER_PROMPT_TEMPLATE = """Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
+
+Question:
+{question}
+Options:
+{choices}
+""".lstrip()  # noqa: E501
+
+SUBSET_LIST = [
+    'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
+    'philosophy', 'economics', 'other', 'psychology', 'history'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmlu_pro',
+        pretty_name='MMLU-Pro',
+        tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+        description=
+        'MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.',  # noqa: E501
+        dataset_id='TIGER-Lab/MMLU-Pro',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        few_shot_num=5,
+        train_split='validation',
+        eval_split='test',
+        prompt_template=USER_PROMPT_TEMPLATE,
+        few_shot_prompt_template=SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE + USER_PROMPT_TEMPLATE,
+    )
 )
-class MMLUProAdapter(DataAdapter):
+class MMLUProAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-        self.categories = [
-            'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
-            'philosophy', 'economics', 'other', 'psychology', 'history'
-        ]
-
-    def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
-        """
-        Generate model prompt from raw input, unify the prompt format for MMLU-Pro benchmark.
-        Return a dict with category as key and list of prompts as value.
-        """
-
-        data_dict = data_dict[self.subset_list[0]]  # Only one subset for MMLU-Pro
-        fewshot_prompts = self.get_fewshot_examples(data_dict)
-
-        # Use the category as key to group the prompts
-        res_dict = defaultdict(list)
-        # generate prompts for each test sample
-        for entry in data_dict[self.eval_split]:
-            prefix = fewshot_prompts[entry['category']]
-            query = prefix + 'Q: ' + entry['question'] + '\n' + \
-                self.__form_options(entry['options']) + '\n'
-
-            prompt_d = {'data': [query], 'system_prompt': self.prompt_template, AnswerKeys.RAW_INPUT: entry}
-
-            res_dict[entry['category']].append(prompt_d)
-        return res_dict
-
-    def get_fewshot_examples(self, data_dict: dict):
-        # load 5-shot prompts for each category
-        prompts = {c: '' for c in self.categories}
-        for d in data_dict[self.train_split]:
-            prompts[d['category']] += 'Q:' + ' ' + d['question'] + '\n' + \
-                self.__form_options(d['options']) + '\n' + \
-                d['cot_content'] + '\n\n'
-        return prompts
-
-    def __form_options(self, options: list):
-        option_str = 'Options are:\n'
-        for opt, choice in zip(options, self.choices):
-            option_str += f'({choice}): {opt}' + '\n'
-        return option_str
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-
-        Args:
-            input_d: input raw data. Depending on the dataset.
-
-        Returns:
-            The parsed input. e.g. gold answer ... Depending on the dataset.
-        """
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        return ResponseParser.parse_first_option(result)
-
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-
-        Args:
-            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'A', extracted from get_gold_answer method.
-            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'B', extracted from parse_pred_result method.
-
-        Returns:
-            The match result. Usually a score (float) for chat/multiple-choice-questions.
-        """
-        return exact_match(gold=gold, pred=pred)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['options'],
+            target=record['answer'],
+            subset_key=record['category'].lower(),
+            metadata={
+                'cot_content': record['cot_content'],
+                'subject': record['category'].lower(),
+                'question_id': record['question_id'],
+            },
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        q_str = f"""Question:\n{str(sample.input)}"""
+        options = sample.choices if sample.choices is not None else []
+        opt_str_list = []
+        for i, opt in enumerate(options):
+            opt_str_list.append(f"""{chr(65 + i)} {opt}""")
+        opt_str = '\n'.join(opt_str_list)
+        opt_str = f"""Options:\n{opt_str}"""
+        ans_str = sample.metadata['cot_content'] if sample.metadata is not None else ''
+        ans_str = ans_str.replace('The answer is', 'ANSWER:')
+        ans_opt = ans_str.split('ANSWER:')[-1].split('.')[0].strip().strip('(').strip(')')
+        ans_str = ans_str.replace(f'ANSWER: ({ans_opt})', f'ANSWER: {ans_opt}')
+        final_str = '\n'.join([q_str, opt_str, ans_str])
+
+        return final_str
+
+    def format_fewshot_template(self, fewshot, sample):
+        fewshot_str = SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE.format(
+            subject=sample.metadata['subject'],
+            examples=fewshot,
+        )
+        prompt_str = self.format_prompt_template(sample)
+        return fewshot_str + '\n' + prompt_str
File without changes
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
@@ -0,0 +1,139 @@
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+logger = get_logger()
+
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}
+
+SUBSET_LIST = list(SUBJECT_MAPPING.keys())
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmlu_redux',
+        pretty_name='MMLU-Redux',
+        tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+        description=
+        'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options. '  # noqa: E501
+        'The bad answers are corrected.',  # noqa: E501
+        dataset_id='AI-ModelScope/mmlu-redux-2.0',
+        subset_list=SUBSET_LIST,
+        metric_list=[{
+            'acc': {
+                'allow_inclusion': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
+)
+class MMLUReduxAdapter(MultiChoiceAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if self.few_shot_num > 0:
+            self.few_shot_num = 0
+            logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        error_type = record['error_type']
+        choices = record['choices']
+        target_index_list = [int(record['answer'])]
+        correct_answer = record['correct_answer']
+        if error_type == 'no_correct_answer' and correct_answer:
+            choices[target_index_list[0]] = correct_answer
+        elif error_type == 'wrong_groundtruth' and correct_answer:
+            try:
+                target_index_list = [int(correct_answer)]
+            except ValueError:
+                choice_index = ord(correct_answer) - ord('A')
+                target_index_list = [choice_index]
+        elif error_type == 'multiple_correct_answers' and correct_answer:
+            correct_answer = correct_answer.strip('()')
+            try:
+                correct_answer = correct_answer.replace(' and ', ',').replace(' or ', ',')
+                target_index_list = list(map(int, correct_answer.split(',')))
+            except ValueError:
+                try:
+                    target_index_list = [ord(c) - ord('A') for c in correct_answer.split(',')]
+                except TypeError:
+                    # find the index of the correct answer in choices
+                    target_index_list = [choices.index(c) for c in correct_answer.split(',') if c in choices]
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=['ABCD'[i] for i in target_index_list] if target_index_list else ['A', 'B', 'C', 'D'],
+            metadata={
+                'error_type': error_type,
+                'correct_answer': correct_answer,
+                'potential_reason': record.get('potential_reason', ''),
+            },
+        )
File without changes
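
Taken together, these hunks show the 1.2.0 benchmark registration pattern: an adapter subclasses MultiChoiceAdapter, implements record_to_sample, and is registered through register_benchmark(BenchmarkMeta(...)). A rough sketch of a custom benchmark wired up the same way (an illustration only; the my_mcq name, dataset id, and record fields are hypothetical placeholders, and defaults for BenchmarkMeta fields not listed here are assumed):

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',                    # hypothetical benchmark name
        pretty_name='My-MCQ',
        tags=[Tags.MULTIPLE_CHOICE],
        description='A custom multiple-choice benchmark (illustrative only).',
        dataset_id='org/my-mcq-dataset',  # hypothetical dataset id
        metric_list=['acc'],
        few_shot_num=0,
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record onto the framework's Sample type.
        return Sample(
            input=record['question'],
            choices=record['choices'],
            target='ABCD'[record['answer']],  # integer index -> letter
        )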