evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/arena_hard/arena_hard_adapter.py
@@ -0,0 +1,149 @@
+ # flake8: noqa: E501
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ GRADER_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\".""" # noqa: E501
+
+ GRADER_TEMPLATE = """<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>""".strip(
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='arena_hard',
+         pretty_name='ArenaHard',
+         tags=[Tags.INSTRUCTION_FOLLOWING, Tags.ARENA],
+         description=
+         'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
+         'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
+         'It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. '
+         'Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.',
+         dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
+         metric_list=['winrate'],
+         aggregation='elo',
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template='{question}'
+     )
+ )
+ class ArenaHardAdapter(DefaultDataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self._use_llm_judge = True # Use LLM as a judge by default
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a data record to a Sample object.
+
+         Args:
+             record (Dict[str, Any]): Input data record.
+
+         Returns:
+             Sample: Sample object with input, target, and metadata.
+         """
+         question = record['question']
+         baseline_prediction = record['prediction'] # baseline model prediction
+
+         return Sample(
+             input=question, target=baseline_prediction, metadata={'capability': record.get('capability', 'unknown')}
+         )
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         from .utils import get_judge_score, post_process_arenahard
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.input_text
+
+         # reference is baseline answer 'A', filtered_prediction is model answer 'B'
+         prompt1 = GRADER_TEMPLATE.format(question=question, answer_1=reference, answer_2=filtered_prediction)
+         # reverse the order
+         prompt2 = GRADER_TEMPLATE.format(question=question, answer_1=filtered_prediction, answer_2=reference)
+
+         # get grading response
+         game1_response = self.llm_judge.judge(prompt1, system_prompt=GRADER_SYSTEM_PROMPT)
+         game2_response = self.llm_judge.judge(prompt2, system_prompt=GRADER_SYSTEM_PROMPT)
+
+         # parse grading response
+         res1 = post_process_arenahard(game1_response)
+         res2 = post_process_arenahard(game2_response)
+
+         score1 = get_judge_score(res1, reverse=True)
+         score2 = get_judge_score(res2, reverse=False)
+
+         battle_result = {
+             'model_a':
+             'gpt4-0314',
+             'model_b':
+             'test_model',
+             'games': [
+                 {
+                     'user_prompt': prompt1,
+                     'judgment': game1_response,
+                     'score': res1
+                 },
+                 {
+                     'user_prompt': prompt2,
+                     'judgment': game2_response,
+                     'score': res2
+                 },
+             ]
+         }
+
+         # Set score based on the battle result
+         score.value = {'score': (score1 + score2) / 2}
+         score.explanation = f'LLM judge battles: Game1: {game1_response[:100]}... Game2: {game2_response[:100]}...'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id,
+             'battle_result': battle_result
+         }
+         return score
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         import pandas as pd
+
+         from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
+
+         battles = pd.concat([get_battles_from_row(res.score.metadata['battle_result']) for res in sample_scores])
+
+         bootstrap_online_elo = compute_mle_elo(battles)
+
+         stats = pd.DataFrame()
+         stats['results'] = None
+         stats['results'] = stats['results'].astype('object')
+
+         for i, model in enumerate(bootstrap_online_elo.index):
+             # assert model in bootstrap_elo_lu.columns
+             stats.at[i, 'model'] = model
+             stats.at[i, 'score'] = bootstrap_online_elo[model]
+
+         score = get_win_rate_column(stats, 'score', 'gpt4-0314').at['test_model']
+
+         return [AggScore(
+             score=score,
+             metric_name='winrate',
+             num=len(sample_scores),
+         )]
evalscope/benchmarks/arena_hard/utils.py
@@ -0,0 +1,186 @@
+ import math
+ import numpy as np
+ import pandas as pd
+ import re
+ from collections import defaultdict
+ from sklearn.linear_model import LogisticRegression
+ from tqdm import tqdm
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def post_process_arenahard(completion):
+     result = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
+     if result:
+         return result[0]
+     else:
+         return None
+
+
+ def get_judge_score(result, reverse=False):
+     """
+     Calculate the judge score, considering confidence weight.
+
+     Args:
+         result: Judgment result ('A=B', 'A>B', 'A>>B', 'B>A', 'B>>A')
+         reverse: Whether to reverse the score
+
+     Returns:
+         float: Weighted score
+     """
+
+     # Base score mapping - using finer-grained scores
+     if not reverse:
+         score_mapping = {
+             'A=B': 0.5, # Tie
+             'A>B': 0.75, # A slightly wins
+             'A>>B': 1.0, # A significantly wins
+             'B>A': 0.25, # B slightly wins
+             'B>>A': 0.0, # B significantly wins
+         }
+     else:
+         score_mapping = {
+             'A=B': 0.5, # Tie
+             'A>B': 0.25, # A slightly wins
+             'A>>B': 0.0, # A significantly wins
+             'B>A': 0.75, # B slightly wins
+             'B>>A': 1.0, # B significantly wins
+         }
+
+     base_score = score_mapping.get(result, 0.5)
+
+     return base_score
+
+
+ def get_battles_from_row(row, first_game_only=False, multiplier=3):
+     results = []
+     output = {'model_a': row['model_a'], 'model_b': row['model_b']}
+
+     game = row['games'][0]
+     weight = 1
+     if game['score'] == 'A=B':
+         output['winner'] = 'tie'
+     elif game['score'] == 'A>B':
+         output['winner'] = 'model_a'
+     elif game['score'] == 'A>>B':
+         output['winner'] = 'model_a'
+         weight = multiplier
+     elif game['score'] == 'B>A':
+         output['winner'] = 'model_b'
+     elif game['score'] == 'B>>A':
+         output['winner'] = 'model_b'
+         weight = multiplier
+     else:
+         weight = 0
+
+     if weight:
+         results += [output] * weight
+
+     if first_game_only:
+         return pd.DataFrame(results)
+
+     # game 2
+     output = {'model_a': row['model_a'], 'model_b': row['model_b']}
+
+     game = row['games'][1]
+
+     weight = 1
+     if game['score'] == 'A=B':
+         output['winner'] = 'tie'
+     elif game['score'] == 'A>B':
+         output['winner'] = 'model_b'
+     elif game['score'] == 'A>>B':
+         output['winner'] = 'model_b'
+         weight = multiplier
+     elif game['score'] == 'B>A':
+         output['winner'] = 'model_a'
+     elif game['score'] == 'B>>A':
+         output['winner'] = 'model_a'
+         weight = multiplier
+     else:
+         weight = 0
+
+     if weight:
+         results += [output] * weight
+
+     return pd.DataFrame(results)
+
+
+ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
+     models = pd.concat([df['model_a'], df['model_b']]).unique()
+     models = pd.Series(np.arange(len(models)), index=models)
+
+     # duplicate battles
+     df = pd.concat([df, df], ignore_index=True)
+     p = len(models.index)
+     n = df.shape[0]
+
+     X = np.zeros([n, p])
+     X[np.arange(n), models[df['model_a']]] = +math.log(BASE)
+     X[np.arange(n), models[df['model_b']]] = -math.log(BASE)
+
+     # one A win => two A win
+     Y = np.zeros(n)
+     Y[df['winner'] == 'model_a'] = 1.0
+
+     # one tie => one A win + one B win
+     # find tie + tie (both bad) index
+     tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
+     tie_idx[len(tie_idx) // 2:] = False
+     Y[tie_idx] = 1.0
+
+     if len(np.unique(Y)) < 2:
+         logger.info('Warning: Only one class in the data')
+         elo_scores = pd.Series(INIT_RATING, index=models.index)
+         if np.all(Y == 1.0):
+             elo_scores[df['model_a'].iloc[0]] += SCALE # Boost the winning model
+         elif np.all(Y == 0.0):
+             elo_scores[df['model_b'].iloc[0]] += SCALE # Boost the winning model
+         return elo_scores.sort_values(ascending=False)
+
+     lr = LogisticRegression(
+         fit_intercept=False, penalty=None, tol=1e-8
+     ) # May need to set a small value when not use GPT4 as judge model
+     lr.fit(X, Y)
+
+     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+
+     # set anchor as gpt4-0314 = 1000
+     if 'gpt4-0314' in models.index:
+         elo_scores += 1000 - elo_scores[models['gpt4-0314']]
+     return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
+
+
+ def get_bootstrap_result(battles, func_compute_elo, num_round):
+     rows = []
+     for _ in tqdm(range(num_round), desc='bootstrap'):
+         res = func_compute_elo(battles.sample(frac=1.0, replace=True))
+         if res is not None:
+             rows.append(res)
+     df = pd.DataFrame(rows)
+     return df[df.median().sort_values(ascending=False).index]
+
+
+ def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
+     names = sorted(list(elo_ratings.keys()))
+     wins = defaultdict(lambda: defaultdict(lambda: 0))
+     for a in names:
+         for b in names:
+             ea = 1 / (1 + BASE**((elo_ratings[b] - elo_ratings[a]) / SCALE))
+             wins[a][b] = ea
+             wins[b][a] = 1 - ea
+
+     data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}
+
+     df = pd.DataFrame(data, index=names)
+     df.index.name = 'model_a'
+     df.columns.name = 'model_b'
+     return df.T
+
+
+ def get_win_rate_column(df, column, baseline='gpt4-0314'):
+     to_dict = df[['model', column]].set_index('model').to_dict()[column]
+     win_rate_table = predict_win_rate(to_dict)
+     return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x, 4))
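
Note on predict_win_rate above: it converts the fitted Elo ratings back into pairwise win probabilities using the standard Elo expectation, and get_win_rate_column then reads off each model's expected win rate against the gpt4-0314 anchor. A small self-contained sketch of that formula (the ratings below are made up for illustration):

BASE, SCALE = 10, 400

def expected_win_rate(r_a: float, r_b: float) -> float:
    # Probability that model A beats model B under the Elo model,
    # mirroring the expression used in predict_win_rate.
    return 1 / (1 + BASE**((r_b - r_a) / SCALE))

print(round(expected_win_rate(1100, 1000), 4))  # 0.6401: a 100-point rating gap is roughly a 64% win rate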
evalscope/benchmarks/bbh/bbh_adapter.py
@@ -1,19 +1,16 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import json
  import os
- import random
  import re
+ from typing import Any, Dict

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import AnswerKeys
- from evalscope.metrics import AverageAccuracy, exact_match
- from evalscope.models.chat_adapter import ChatGenerationModelAdapter
- from evalscope.utils import ResponseParser
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

- # flake8: noqa
-
  logger = get_logger()

  # BBH multiple choice subset list
@@ -57,185 +54,148 @@ FREE_FORM_LIST = [
  TASK_TYPE = 'task_type'
  SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST

-
- @Benchmark.register(
-     name='bbh',
-     dataset_id='modelscope/bbh',
-     model_adapter=ChatGenerationModelAdapter,
-     subset_list=SUBSET_LIST,
-     metric_list=[AverageAccuracy],
-     few_shot_num=3,
-     train_split=None,
-     eval_split='test',
-     prompt_template='',
+ PROMPT_TEMPLATE = """
+ Q: {question}
+ A: Let's think step by step. Put your final answer in the format of "So the answer is $ANSWER" (without quotes and markdown) where $ANSWER is the answer to the problem.
+ """.lstrip() # noqa: E501
+
+ FEWSHOT_TEMPLATE = """
+ {fewshot}
+
+ """.lstrip() + PROMPT_TEMPLATE
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='bbh',
+         pretty_name='BBH',
+         dataset_id='evalscope/bbh',
+         tags=[Tags.REASONING],
+         description=
+         'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.', # noqa: E501
+         subset_list=SUBSET_LIST,
+         few_shot_num=3,
+         train_split=None,
+         eval_split='test',
+         metric_list=['acc'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
  )
- class BBHAdapter(DataAdapter):
+ class BBHAdapter(DefaultDataAdapter):
      """
      Adapter for BBH free-form and multiple-choices sub-tasks.
      """

      def __init__(self, **kwargs):
-
          few_shot_num = kwargs.get('few_shot_num', 3)

          if few_shot_num != 3 and few_shot_num != 0:
-             logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
-                          f'Use 3-shot by default.')
+             logger.error(
+                 f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
+                 f'Use 3-shot by default.'
+             )
              kwargs['few_shot_num'] = 3

          super().__init__(**kwargs)

-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = {}
-         for subset_name in subset_list:
-             for split_name in [self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     file_path = os.path.join(dataset_name_or_path, f'{subset_name}.json')
-                 else:
-                     file_path: str = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}.json')
-                 if os.path.exists(file_path):
-                     with open(file_path, 'r') as f:
-                         examples = json.load(f)['examples']
-                         if subset_name in data_dict:
-                             data_dict[subset_name].update({split_name: examples})
-                         else:
-                             data_dict[subset_name] = {split_name: examples}
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from raw data, unify the prompt format for bbh(multiple choice) benchmark.
-
-         Args:
-             input_d (dict): The raw input. A single data format of the BBH:
-
-             {
-                 'input': '((-1 + 2 + 9 * 5) - (-2 + -4 + -4 * -7)) =',
-                 'target': '24',
-             }
-
-         Returns:
-             {'data': ['xxx']}
-         """
-         # few_shot_list: should be ['xxxx']
-         cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
-         full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."
-
-         return {'data': [full_prompt], 'system_prompt': self.prompt_template}
-
-     def gen_prompts(self, data_dict: dict) -> dict:
-         """
-         Generate dataset prompts from raw input, unify the prompt format for different datasets.
-
-         Args:
-             data_dict: Refer to the output of load method: evalscope.benchmarks.benchmark.Benchmark.load
-
-         Returns:
-             {'subset_name': [prompt_d_1, prompt_d_2, ...]}
-             prompt_d_i (dict): refer to the output of gen_prompt method.
-
-         e.g. train -- few-shot data, test -- target dataset to evaluate.
-         """
-         res_dict: dict = {}
-
-         if self.few_shot_num < 0:
-             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
-
-         logger.info(f'Use default settings: '
-                     f'> few_shot_num: {self.few_shot_num}, '
-                     f'> few_shot_split: {self.train_split}, '
-                     f'> target_eval_split: {self.eval_split}')
-
-         for sub_name, sub_data_dict in data_dict.items():
-             few_shot_data = []
-             if self.few_shot_num > 0:
-                 with open(os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r') as f:
-                     cot_prompt_str = f.read()
-                 few_shot_data = [cot_prompt_str]
-
-             res_dict[sub_name] = []
-             for sample_d in sub_data_dict[self.eval_split]:
-                 prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=few_shot_data)
-                 sample_d_new = sample_d.copy()
-                 if sub_name in MULTIPLE_CHOICE_LIST:
-                     sample_d_new[TASK_TYPE] = MULTIPLE_CHOICE
-                 elif sub_name in FREE_FORM_LIST:
-                     sample_d_new[TASK_TYPE] = FREE_FORM
-                 else:
-                     raise ValueError(f'Invalid subset name: {sub_name}')
-
-                 prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
-                 res_dict[sub_name].append(prompt_d)
-
-         rnd = random.Random()
-         rnd.seed(42)
-         for k, v in res_dict.items():
-             rnd.shuffle(v)
-
-         return res_dict
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         gold = input_d.get('target')
-         if gold is None:
-             logger.error(f'BBHAdapter: gold is None.')
-         return gold
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-         """
-         Parse the model output to get the answer. Could be the best choice index.
-
-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d (dict): The raw input. Depending on the dataset.
-             eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         # Note: to use same extraction method for both of checkpoint/service/custom.
-         task_type: str = raw_input_d.get(TASK_TYPE)
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         input = record['input']
+         target = record['target'].replace('(', '').replace(')', '').strip() # Clean up the target answer
+
+         # Determine task type based on subset name
+         task_type = None
+         subset_name = self.current_subset_name
+         if subset_name in MULTIPLE_CHOICE_LIST:
+             task_type = MULTIPLE_CHOICE
+         elif subset_name in FREE_FORM_LIST:
+             task_type = FREE_FORM
+
+         metadata = {TASK_TYPE: task_type}
+
+         return Sample(input=input, target=target, metadata=metadata, subset_key=subset_name)
+
+     def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+         # Load CoT prompts from file for BBH
+         subset_name = sample.subset_key
+         if subset_name:
+             cot_file_path = os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{subset_name}.txt')
+             if os.path.exists(cot_file_path):
+                 with open(cot_file_path, 'r', encoding='utf-8') as f:
+                     fewshot = f.read().strip()
+         return self.few_shot_prompt_template.format(
+             fewshot=fewshot,
+             question=sample.input,
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState):
+         task_type = task_state.metadata.get(TASK_TYPE)

          if task_type == MULTIPLE_CHOICE:
-             return self._extract_mc_answer(result)
+             return self._extract_mc_answer(prediction)
          elif task_type == FREE_FORM:
-             return self._extract_ff_answer(result)
+             return self._extract_ff_answer(prediction)
          else:
-             raise ValueError(f'Invalid task type: {task_type}')
-
-     def match(self, gold: str, pred: str) -> float:
-         return exact_match(gold=gold, pred=pred)
+             return prediction.strip()

      @classmethod
      def _extract_mc_answer(cls, ans: str) -> str:
          """
-         Extract the answer from the model output for Multiple choice task.
+         Extract normalized answer for BBH multiple-choice tasks.
+         Handles formats like:
+         - "answer is (A)"
+         - "The answer is A."
+         - Extra text after answer.
+         Always uses the *last* occurrence of "answer is".
          """
-         ans_line = ans.split('answer is ')
-         if len(ans_line) != 1:
-             ans = ans_line[1].strip()
-         match = re.search(r'\(([A-Z])\)*', ans)
+         ans = ans.strip()
+
+         parts = ans.split('So the answer is ')
+         if len(parts) > 1:
+             ans = parts[-1].strip()
+             ans = ans.split('\n')[0].strip()
+
+         # Remove trailing period
+         if ans.endswith('.'):
+             ans = ans[:-1].strip()
+
+         # Capture uppercase letter inside parentheses (A) (B) ...
+         match = re.search(r'\(([A-Z])\)', ans)
          if match:
              return match.group(1)
-         match = re.search(r'([A-Z])', ans)
+
+         # Capture single uppercase letter
+         match = re.search(r'\b([A-Z])\b', ans)
          if match:
              return match.group(1)
+
          return ans

      @classmethod
      def _extract_ff_answer(cls, ans: str):
          """
-         Extract the answer from the model output for Free-form task.
+         Extract the normalized answer for BBH free-form tasks.
+         Handles patterns like:
+         - "answer is XXX."
+         - "The answer is **valid**."
+         - Extra trailing dots / line breaks.
+         - Bold-marked answers (**xxx**).
+         Always uses the *last* occurrence of "answer is".
          """
-         res = ResponseParser.parse_first_option(ans)
-         if res:
-             return res
-
-         ans_line = ans.split('answer is ')
-         if len(ans_line) != 1:
-             ans = ans_line[1].strip()
-         ans = ans.split('\n')[0]
+         ans = ans.strip()
+
+         parts = ans.split('So the answer is ')
+         if len(parts) > 1:
+             ans = parts[-1].strip()
+             ans = ans.split('\n')[0].strip()
+
+         # Remove trailing period
          if ans.endswith('.'):
-             ans = ans[:-1]
+             ans = ans[:-1].strip()
+
+         # If answer is in bold (**xxx**), prefer the content inside
+         match = re.search(r'\*\*(.*?)\*\*', ans)
+         if match:
+             ans = match.group(1).strip()
+
          return ans
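
Note on the rewritten extraction helpers above: they anchor on the new prompt's "So the answer is ..." phrasing and normalize trailing periods, line breaks, and bold markers. A few illustrative calls (the inputs are made-up model outputs, assuming the module is importable under the path shown in the file list):

from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter

# Multiple-choice: keep the letter inside parentheses.
print(BBHAdapter._extract_mc_answer('Let us reason step by step... So the answer is (B).'))  # -> 'B'
# Free-form: strip the trailing period and unwrap bold markers.
print(BBHAdapter._extract_ff_answer('So the answer is **valid**.'))  # -> 'valid'
print(BBHAdapter._extract_ff_answer('So the answer is 24.\nExtra commentary'))  # -> '24'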
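
Note: the common thread across the adapter rewrites in this release is declarative registration: a BenchmarkMeta passed to @register_benchmark on a DefaultDataAdapter subclass that maps raw records to Sample objects. A minimal sketch of that shape, assuming only what the diffs above show (the benchmark name, dataset id, and record fields here are hypothetical placeholders, and additional BenchmarkMeta fields may be required in practice):

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_benchmark',  # hypothetical benchmark name
        dataset_id='my-org/my-dataset',  # hypothetical dataset id
        metric_list=['acc'],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template='{question}',
    )
)
class MyAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record onto evalscope's Sample structure.
        return Sample(input=record['question'], target=record['answer'])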