evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/live_code_bench/testing_util.py
@@ -0,0 +1,544 @@
+ # flake8: noqa
+ import ast
+ import faulthandler
+ import json
+ import numpy as np
+ import platform
+
+ # to run the solution files we're using a timing based approach
+ import signal
+ import sys
+ import time
+
+ # used for debugging to time steps
+ from datetime import datetime
+ from decimal import Decimal
+ from enum import Enum
+ from functools import partial
+ from io import StringIO
+
+ # from pyext import RuntimeModule
+ from types import ModuleType
+
+ # used for testing the code that reads from input
+ from unittest.mock import mock_open, patch
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ import_string = 'from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n'
+
+
+ def truncatefn(s, length=300):
+     if isinstance(s, str):
+         pass
+     else:
+         s = str(s)
+     if len(s) <= length:
+         return s
+
+     return s[:length // 2] + '...(truncated) ...' + s[-length // 2:]
+
+
+ class CODE_TYPE(Enum):
+     call_based = 0
+     standard_input = 1
+
+
+ # stuff for setting up signal timer
+ class TimeoutException(Exception):
+     pass
+
+
+ def timeout_handler(debug, signum, frame):
+     if debug:
+         logger.info('timeout occurred: alarm went off')
+     raise TimeoutException
+
+
+ # used to capture stdout as a list
+ # from https://stackoverflow.com/a/16571630/6416660
+ # alternative use redirect_stdout() from contextlib
+ class Capturing(list):
+
+     def __enter__(self):
+         self._stdout = sys.stdout
+         sys.stdout = self._stringio = StringIO()
+         # Make closing the StringIO a no-op
+         self._stringio.close = lambda x: 1
+         return self
+
+     def __exit__(self, *args):
+         self.append(self._stringio.getvalue())
+         del self._stringio  # free up some memory
+         sys.stdout = self._stdout
+
+
+ def clean_if_name(code: str) -> str:
+     try:
+         astree = ast.parse(code)
+         last_block = astree.body[-1]
+         if isinstance(last_block, ast.If):
+             condition = last_block.test
+             if ast.unparse(condition).strip() == "__name__ == '__main__'":
+                 code = (
+                     ast.unparse(astree.body[:-1]) + '\n' + ast.unparse(last_block.body)  # type: ignore
+                 )
+     except:
+         pass
+
+     return code
+
+
+ def make_function(code: str) -> str:
+     try:
+         import_stmts = []
+         all_other_stmts = []
+         astree = ast.parse(code)
+         for stmt in astree.body:
+             if isinstance(stmt, (ast.Import, ast.ImportFrom)):
+                 import_stmts.append(stmt)
+             else:
+                 all_other_stmts.append(stmt)
+
+         function_ast = ast.FunctionDef(
+             name='wrapped_function',
+             args=ast.arguments(posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]),
+             body=all_other_stmts,
+             decorator_list=[],
+             lineno=-1,
+         )
+         main_code = (
+             import_string + '\n' + ast.unparse(import_stmts)  # type: ignore
+             + '\n' + ast.unparse(function_ast)  # type: ignore
+         )
+         return main_code
+     except Exception as e:
+         return code
+
+
+ def call_method(method, inputs):
+
+     if isinstance(inputs, list):
+         inputs = '\n'.join(inputs)
+
+     inputs_line_iterator = iter(inputs.split('\n'))
+
+     # sys.setrecursionlimit(10000)
+
+     # @patch('builtins.input', side_effect=inputs.split("\n"))
+     @patch('builtins.open', mock_open(read_data=inputs))
+     @patch('sys.stdin', StringIO(inputs))
+     @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
+     @patch('sys.stdin.readlines', lambda *args: inputs.split('\n'))
+     @patch('sys.stdin.read', lambda *args: inputs)
+     # @patch('sys.stdout.write', print)
+     def _inner_call_method(_method):
+         try:
+             return _method()
+         except SystemExit as e:
+             pass
+         finally:
+             pass
+
+     return _inner_call_method(method)
+
+
+ def get_function(compiled_sol, fn_name: str):  # type: ignore
+     try:
+         assert hasattr(compiled_sol, fn_name)
+         return getattr(compiled_sol, fn_name)
+     except Exception as e:
+         return
+
+
+ def compile_code(code: str, timeout: int):
+     signal.alarm(timeout)
+     try:
+         tmp_sol = ModuleType('tmp_sol', '')
+         exec(code, tmp_sol.__dict__)
+         if 'class Solution' in code:
+             # leetcode wraps solutions in `Solution`
+             # this is a hack to check if it is leetcode solution or not
+             # currently livecodebench only supports LeetCode but
+             # else condition allows future extensibility to other platforms
+             compiled_sol = tmp_sol.Solution()
+         else:
+             # do nothing in the other case since function is accessible
+             compiled_sol = tmp_sol
+
+         assert compiled_sol is not None
+     finally:
+         signal.alarm(0)
+
+     return compiled_sol
+
+
+ def convert_line_to_decimals(line: str) -> tuple[bool, list[Decimal]]:
+     try:
+         decimal_line = [Decimal(elem) for elem in line.split()]
+     except:
+         return False, []
+     return True, decimal_line
+
+
+ def get_stripped_lines(val: str):
+     ## you don't want empty lines to add empty list after splitlines!
+     val = val.strip()
+
+     return [val_line.strip() for val_line in val.split('\n')]
+
+
+ def grade_call_based(code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int):
+     # call-based clean up logic
+     # need to wrap in try-catch logic after to catch the correct errors, but for now this is fine.
+     code = import_string + '\n\n' + code
+     compiled_sol = compile_code(code, timeout)
+
+     if compiled_sol is None:
+         return
+
+     method = get_function(compiled_sol, fn_name)
+
+     if method is None:
+         return
+
+     all_inputs = [[json.loads(line) for line in inputs.split('\n')] for inputs in all_inputs]
+
+     all_outputs = [json.loads(output) for output in all_outputs]
+
+     total_execution = 0
+     all_results = []
+     for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
+         signal.alarm(timeout)
+         # faulthandler.enable()
+         try:
+             # can lock here so time is useful
+             start = time.time()
+             prediction = method(*gt_inp)
+             total_execution += time.time() - start
+             signal.alarm(0)
+
+             # don't penalize model if it produces tuples instead of lists
+             # ground truth sequences are not tuples
+             if isinstance(prediction, tuple):
+                 prediction = list(prediction)
+
+             tmp_result = prediction == gt_out
+
+             # handle floating point comparisons
+
+             all_results.append(tmp_result)
+
+             if not tmp_result:
+                 return all_results, {
+                     'output': truncatefn(prediction),
+                     'inputs': truncatefn(gt_inp),
+                     'expected': truncatefn(gt_out),
+                     'error_code': -2,
+                     'error_message': 'Wrong Answer',
+                 }
+         except Exception as e:
+             signal.alarm(0)
+             if 'timeoutexception' in repr(e).lower():
+                 all_results.append(-3)
+                 return all_results, {
+                     'error': repr(e),
+                     'error_code': -3,
+                     'error_message': 'Time Limit Exceeded',
+                     'inputs': truncatefn(gt_inp),
+                     'expected': truncatefn(gt_out),
+                 }
+             else:
+                 all_results.append(-4)
+                 return all_results, {
+                     'error': repr(e),
+                     'error_code': -4,
+                     'error_message': 'Runtime Error',
+                     'inputs': truncatefn(gt_inp),
+                     'expected': truncatefn(gt_out),
+                 }
+
+         finally:
+             signal.alarm(0)
+             # faulthandler.disable()
+
+     return all_results, {'execution time': total_execution}
+
+
+ def grade_stdio(
+     code: str,
+     all_inputs: list,
+     all_outputs: list,
+     timeout: int,
+ ):
+     ## runtime doesn't interact well with __name__ == '__main__'
+     code = clean_if_name(code)
+
+     ## we wrap the given code inside another function
+     code = make_function(code)
+
+     compiled_sol = compile_code(code, timeout)
+     if compiled_sol is None:
+         return
+
+     method = get_function(compiled_sol, 'wrapped_function')
+
+     if method is None:
+         return
+
+     all_results = []
+     total_execution_time = 0
+     for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
+         signal.alarm(timeout)
+         # faulthandler.enable()
+
+         with Capturing() as captured_output:
+             try:
+                 start = time.time()
+                 call_method(method, gt_inp)
+                 total_execution_time += time.time() - start
+                 # reset the alarm
+                 signal.alarm(0)
+             except Exception as e:
+                 signal.alarm(0)
+                 if 'timeoutexception' in repr(e).lower():
+                     all_results.append(-3)
+                     return all_results, {
+                         'error': repr(e),
+                         'error_code': -3,
+                         'error_message': 'Time Limit Exceeded',
+                         'inputs': truncatefn(gt_inp),
+                         'expected': truncatefn(gt_out),
+                     }
+                 else:
+                     all_results.append(-4)
+                     return all_results, {
+                         'error': repr(e),
+                         'error_code': -4,
+                         'error_message': 'Runtime Error',
+                         'inputs': truncatefn(gt_inp),
+                         'expected': truncatefn(gt_out),
+                     }
+
+             finally:
+                 signal.alarm(0)
+                 # faulthandler.disable()
+
+         prediction = captured_output[0]
+
+         stripped_prediction_lines = get_stripped_lines(prediction)
+         stripped_gt_out_lines = get_stripped_lines(gt_out)
+
+         ## WA happens in multiple circumstances
+         ## so cache the return to make it clean!
+         WA_send_args = {
+             'output': truncatefn(prediction),
+             'inputs': truncatefn(gt_inp),
+             'expected': truncatefn(gt_out),
+             'error_code': -2,
+         }
+
+         if len(stripped_prediction_lines) != len(stripped_gt_out_lines):
+             all_results.append(-2)
+             WA_send_args['error_message'] = 'Wrong answer: mismatched output length'
+             return all_results, WA_send_args
+
+         for output_line_idx, (
+             stripped_prediction_line,
+             stripped_gt_out_line,
+         ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
+             WA_send_args['error_message'] = (
+                 f'Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}'
+             )
+
+             ## CASE 1: exact match
+             if stripped_prediction_line == stripped_gt_out_line:
+                 continue
+
+             ## CASE 2: element-wise comparison
+             ## if there are floating elements
+             ## use `decimal` library for good floating point comparison
+             ## otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True
+             ## note that we should always be able to convert to decimals
+
+             success, decimal_prediction_line = convert_line_to_decimals(stripped_prediction_line)
+             if not success:
+                 all_results.append(-2)
+                 return all_results, WA_send_args
+             success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line)
+             if not success:
+                 all_results.append(-2)
+                 return all_results, WA_send_args
+
+             if decimal_prediction_line == decimal_gtout_line:
+                 continue
+
+             all_results.append(-2)
+             return all_results, WA_send_args
+         all_results.append(True)
+
+     return all_results, {'execution time': total_execution_time}
+
+
+ def run_test(sample, test=None, debug=False, timeout=6):
+     """
+     if test(generated_code) is not None it'll try to run the code.
+     otherwise it'll just return an input and output pair.
+     """
+     timeout_handler_wrapper = partial(timeout_handler, debug)
+     signal.signal(signal.SIGALRM, timeout_handler_wrapper)
+
+     # Disable functionalities that can make destructive changes to the test.
+     # max memory is set to 4GB
+     reliability_guard()
+
+     if debug:
+         logger.info(f'start = {datetime.now().time()}')
+
+     try:
+         in_outs = json.loads(sample['input_output'])
+     except ValueError as e:
+         raise e
+         in_outs = None
+
+     if in_outs:
+         if in_outs.get('fn_name') is None:
+             which_type = CODE_TYPE.standard_input  # Standard input
+             method_name = None
+
+         else:
+             which_type = CODE_TYPE.call_based  # Call-based
+             method_name = in_outs['fn_name']
+
+     if debug:
+         logger.info(f'loaded input_output = {datetime.now().time()}')
+
+     if test is None:
+         assert False, 'should not happen: test code is none'
+         return in_outs, {'error': 'no test code provided'}
+     elif test is not None:
+         results = []
+         sol = import_string
+         if debug:
+             logger.info(f'loading test code = {datetime.now().time()}')
+
+         if which_type == CODE_TYPE.call_based:
+             signal.alarm(timeout)
+             try:
+                 results, metadata = grade_call_based(
+                     code=test,
+                     all_inputs=in_outs['inputs'],
+                     all_outputs=in_outs['outputs'],
+                     fn_name=method_name,
+                     timeout=timeout,
+                 )
+                 return results, metadata
+             except Exception as e:
+                 return [-4], {
+                     'error_code': -4,
+                     'error_message': f'Error during testing: {e}',
+                 }
+             finally:
+                 signal.alarm(0)
+         elif which_type == CODE_TYPE.standard_input:
+             # sol
+             # if code has if __name__ == "__main__": then remove it
+
+             signal.alarm(timeout)
+             try:
+                 results, metadata = grade_stdio(
+                     code=test,
+                     all_inputs=in_outs['inputs'],
+                     all_outputs=in_outs['outputs'],
+                     timeout=timeout,
+                 )
+                 return results, metadata
+             except Exception as e:
+                 return [-4], {
+                     'error_code': -4,
+                     'error_message': f'Error during testing: {e}',
+                 }
+             finally:
+                 signal.alarm(0)
+
+
+ def reliability_guard(maximum_memory_bytes=None):
+     """
+     This disables various destructive functions and prevents the generated code
+     from interfering with the test (e.g. fork bomb, killing other processes,
+     removing filesystem files, etc.)
+     WARNING
+     This function is NOT a security sandbox. Untrusted code, including model-
+     generated code, should not be blindly executed outside of one. See the
+     Codex paper for more information about OpenAI's code sandbox, and proceed
+     with caution.
+     """
+
+     if maximum_memory_bytes is not None:
+         import resource
+
+         resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+         resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+         if not platform.uname().system == 'Darwin':
+             resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+     # faulthandler.disable()
+
+     import builtins
+
+     # builtins.exit = None
+     builtins.quit = None
+
+     import os
+
+     os.environ['OMP_NUM_THREADS'] = '1'
+
+     os.kill = None
+     os.system = None
+     os.putenv = None
+     os.remove = None
+     os.removedirs = None
+     os.rmdir = None
+     os.fchdir = None
+     os.setuid = None
+     os.fork = None
+     os.forkpty = None
+     os.killpg = None
+     os.rename = None
+     os.renames = None
+     os.truncate = None
+     os.replace = None
+     os.unlink = None
+     os.fchmod = None
+     os.fchown = None
+     os.chmod = None
+     os.chown = None
+     os.chroot = None
+     os.fchdir = None
+     os.lchflags = None
+     os.lchmod = None
+     os.lchown = None
+     os.getcwd = None
+     os.chdir = None
+
+     import shutil
+
+     shutil.rmtree = None
+     shutil.move = None
+     shutil.chown = None
+
+     import subprocess
+
+     subprocess.Popen = None  # type: ignore
+
+     __builtins__['help'] = None
+
+     import sys
+
+     sys.modules['ipdb'] = None
+     sys.modules['joblib'] = None
+     sys.modules['resource'] = None
+     sys.modules['psutil'] = None
+     sys.modules['tkinter'] = None
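Note on the grader above (testing_util.py): run_test parses the sample's input_output JSON and dispatches on the presence of fn_name — call-based problems invoke the named method with JSON-decoded arguments, while stdin/stdout problems are wrapped into wrapped_function and run against patched standard input. A minimal driver sketch, hypothetical and not part of this diff; run_test installs a SIGALRM handler and calls reliability_guard(), which permanently disables os/shutil functions in-process, so it is normally invoked in a disposable Unix subprocess:

import json

from evalscope.benchmarks.live_code_bench.testing_util import run_test

# One call-based test case: a present 'fn_name' selects call-based grading.
sample = {
    'input_output': json.dumps({
        'fn_name': 'add',    # looked up on the LeetCode-style Solution instance
        'inputs': ['1\n2'],  # one JSON value per line -> add(1, 2)
        'outputs': ['3'],    # expected return value, JSON-encoded
    })
}
solution_code = 'class Solution:\n    def add(self, a, b):\n        return a + b'

results, metadata = run_test(sample, test=solution_code, timeout=6)
print(results, metadata)  # [True] plus timing on success; error_code -2/-3/-4
                          # in metadata marks Wrong Answer / TLE / Runtime Error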
evalscope/benchmarks/logi_qa/logi_qa_adapter.py
@@ -0,0 +1,41 @@
+ # flake8: noqa: E501
+
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+
+ DESCRIPTION = 'LogiQA is a dataset sourced from expert-written questions for testing human Logical reasoning.'
+
+ PROMPT_TEMPLATE = r"""
+ Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+ {question}
+
+ {choices}
+ """.strip()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='logi_qa',
+         pretty_name='LogiQA',
+         tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/logiqa',
+         metric_list=['acc'],
+         few_shot_num=0,
+         train_split='validation',
+         eval_split='test',
+         prompt_template=PROMPT_TEMPLATE,
+     )
+ )
+ class LogiQAAdapter(MultiChoiceAdapter):
+
+     def record_to_sample(self, record) -> Sample:
+         return Sample(
+             input=f"{record['context']}\n{record['question']}",
+             choices=record['choices'],
+             target=record['answer'],
+             metadata={},
+         )
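The LogiQA adapter's record_to_sample is the entire dataset mapping; a hypothetical record (field names taken from the code above) shows the shape it expects. In practice the registry builds the adapter from the BenchmarkMeta, so the bare constructor call here is sketch-only:

record = {
    'context': 'All squares are rectangles.',
    'question': 'Which statement must be true?',
    'choices': ['No rectangle is a square.', 'Every square is a rectangle.',
                'All rectangles are squares.', 'Squares have five sides.'],
    'answer': 'B',
}

sample = LogiQAAdapter().record_to_sample(record)  # constructor kwargs elided
assert sample.target == 'B'
assert sample.input == record['context'] + '\n' + record['question']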
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
@@ -0,0 +1,56 @@
+ from typing import Any
+
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+
+ MARITIME_PROMPT_TEMPLATE = '请回答单选题。要求只输出选项,不输出解释,将选项放在[]里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:[A]\n 当前题目\n {question}\n选项:\n{choices}'  # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='maritime_bench',
+         pretty_name='MaritimeBench',
+         tags=[Tags.CHINESE, Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+         description=
+         'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.',  # noqa: E501
+         dataset_id='HiDolphin/MaritimeBench',
+         metric_list=['acc'],
+         few_shot_num=0,
+         eval_split='test',
+         prompt_template=MARITIME_PROMPT_TEMPLATE,
+     )
+ )
+ class MaritimeBenchAdapter(MultiChoiceAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.reformat_subset = True
+
+     def record_to_sample(self, record) -> Sample:
+         # Extract available choices from the record
+         choices = []
+         choice_letters = ['A', 'B', 'C', 'D']
+         for letter in choice_letters:
+             if letter in record and record[letter]:
+                 choices.append(record[letter])
+
+         return Sample(
+             input=record['question'],
+             choices=choices,
+             target=record['answer'],
+         )
+
+     def format_prompt_template(self, sample):
+         choices = '\n'.join([f'{chr(65 + i)}. {choice}' for i, choice in enumerate(sample.choices)])
+         return MARITIME_PROMPT_TEMPLATE.format(question=sample.input, choices=choices)
+
+     def extract_answer(self, prediction, task_state):
+         # use regex to extract the answer from the prediction
+         import re
+         match = re.search(r'\[([A-D])\]', prediction)
+         if match:
+             return match.group(1)
+         return ''
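MaritimeBench's prompt instructs the model to emit its chosen letter inside square brackets (答:[A]), and extract_answer above recovers it with a regex; a self-contained check of that convention:

import re

prediction = '该题考查传动轴系受力,不包括电磁力。答:[A]'
match = re.search(r'\[([A-D])\]', prediction)
print(match.group(1) if match else '')  # -> A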
evalscope/benchmarks/math_500/math_500_adapter.py
@@ -0,0 +1,55 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='math_500',
+         pretty_name='MATH-500',
+         tags=[Tags.MATH, Tags.REASONING],
+         description=
+         "MATH-500 is a benchmark for evaluating mathematical reasoning capabilities of AI models. It consists of 500 diverse math problems across five levels of difficulty, designed to test a model's ability to solve complex mathematical problems by generating step-by-step solutions and providing the correct final answer.",  # noqa: E501
+         dataset_id='AI-ModelScope/MATH-500',
+         subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+         metric_list=[{
+             'acc': {
+                 'numeric': True
+             }
+         }],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+     )
+ )
+ class Math500Adapter(DefaultDataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self.reformat_subset = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         return Sample(
+             input=record['problem'],
+             target=record['answer'],
+             subset_key=f"Level {record['level']}",
+             metadata={
+                 'question_id': record['unique_id'],
+                 'solution': record['solution'],
+             },
+         )
+
+     def extract_answer(self, prediction: str, task_state):
+         from evalscope.metrics.math_parser import extract_answer
+
+         return extract_answer(prediction)
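The MATH-500 adapter delegates answer extraction to evalscope.metrics.math_parser (added elsewhere in this diff as evalscope/metrics/math_parser.py +545 -0); the working assumption, matching the \boxed{} prompt template above, is that it pulls the final boxed value out of a step-by-step response:

from evalscope.metrics.math_parser import extract_answer

response = 'We compute 2 + 2 = 4, so the final answer is \\boxed{4}.'
print(extract_answer(response))  # expected to print: 4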