evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/run.py CHANGED
@@ -2,19 +2,16 @@
  """
  Run evaluation for LLMs.
  """
- import os.path
+ import os
  from argparse import Namespace
  from datetime import datetime
  from typing import TYPE_CHECKING, List, Optional, Union

  from evalscope.config import TaskConfig, parse_task_config
  from evalscope.constants import DataCollection, EvalBackend
- from evalscope.utils import seed_everything
  from evalscope.utils.io_utils import OutputsStructure
  from evalscope.utils.logger import configure_logging, get_logger
-
- if TYPE_CHECKING:
-     from evalscope.models import LocalModel
+ from evalscope.utils.model_utils import seed_everything

  logger = get_logger()

@@ -39,25 +36,40 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
      configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

      if task_cfg.eval_backend != EvalBackend.NATIVE:
-         return run_non_native_backend(task_cfg, outputs)
+         result = run_non_native_backend(task_cfg, outputs)
      else:
-         return evaluate_model(task_cfg, outputs)
+         logger.info('Running with native backend')
+         result = evaluate_model(task_cfg, outputs)
+
+     logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
+     logger.info(f'Output directory: {outputs.outputs_dir}')
+
+     return result


  def setup_work_directory(task_cfg: TaskConfig, run_time: str):
      """Set the working directory for the task."""
+     # use cache
      if task_cfg.use_cache:
          task_cfg.work_dir = task_cfg.use_cache
          logger.info(f'Set resume from {task_cfg.work_dir}')
      # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-     task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
+     else:
+         task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

      outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

+     # Unify the output directory structure
      if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
          task_cfg.eval_config['time_str'] = run_time
      elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
          task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+     elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+         from evalscope.backend.rag_eval import Tools
+         if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+             task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+         elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+             task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
      return outputs


@@ -83,69 +95,82 @@ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> d
  def get_backend_manager_class(eval_backend: EvalBackend):
      """Get the backend manager class based on the evaluation backend."""
      if eval_backend == EvalBackend.OPEN_COMPASS:
+         logger.info('Using OpenCompassBackendManager')
          from evalscope.backend.opencompass import OpenCompassBackendManager
          return OpenCompassBackendManager
      elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+         logger.info('Using VLMEvalKitBackendManager')
          from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
          return VLMEvalKitBackendManager
      elif eval_backend == EvalBackend.RAG_EVAL:
+         logger.info('Using RAGEvalBackendManager')
          from evalscope.backend.rag_eval import RAGEvalBackendManager
          return RAGEvalBackendManager
      elif eval_backend == EvalBackend.THIRD_PARTY:
          raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')


- def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
+ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
      """Evaluate the model based on the provided task configuration."""
-     from evalscope.models import get_local_model
+     from evalscope.api.evaluator import Evaluator
+     from evalscope.api.model import get_model_with_task_config
+     from evalscope.api.registry import get_benchmark
+     from evalscope.evaluator import DefaultEvaluator
+     from evalscope.report import gen_table

      # Initialize evaluator
      eval_results = {}
-     base_model = get_local_model(task_cfg)
-     evaluators = []
-     for dataset_name in task_cfg.datasets:
-         evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+     # Initialize model
+     model = get_model_with_task_config(task_config=task_config)
+     # Initialize evaluators for each dataset
+     evaluators: List[Evaluator] = []
+     for dataset_name in task_config.datasets:
+         # Create evaluator for each dataset
+         benchmark = get_benchmark(dataset_name, task_config)
+         evaluator = DefaultEvaluator(
+             task_config=task_config,
+             model=model,
+             benchmark=benchmark,
+             outputs=outputs,
+         )
          evaluators.append(evaluator)

+         # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+         if dataset_name != DataCollection.NAME:
+             task_config.dataset_args[dataset_name] = benchmark.to_dict()
+
      # dump task_cfg to outputs.configs_dir after creating evaluators
-     task_cfg.dump_yaml(outputs.configs_dir)
-     logger.info(task_cfg)
+     task_config.dump_yaml(outputs.configs_dir)
+     logger.info(task_config)

+     # Run evaluation for each evaluator
      for evaluator in evaluators:
-         res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
-         eval_results[dataset_name] = res_dict
+         res_dict = evaluator.eval()
+         eval_results[evaluator.benchmark.name] = res_dict
+
+     # Make overall report
+     try:
+         report_table: str = gen_table(reports_path_list=[outputs.reports_dir], add_overall_metric=True)
+         logger.info(f'Overall report table: \n{report_table} \n')
+     except Exception:
+         logger.error('Failed to generate report table.')
+     # Clean up
+     if model is not None:
+         import gc
+
+         del model
+         del evaluators
+         gc.collect()
+
+         from evalscope.utils.import_utils import check_import
+         if check_import('torch', raise_warning=False):
+             import torch
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()

      return eval_results


- def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
-     """Create an evaluator object for the specified dataset."""
-     from evalscope.benchmarks import Benchmark, BenchmarkMeta
-     from evalscope.evaluator import Evaluator
-     from evalscope.models import initialize_model_adapter
-
-     if dataset_name == DataCollection.NAME:
-         # EvaluatorCollection is a collection of evaluators
-         from evalscope.collections import EvaluatorCollection
-         return EvaluatorCollection(task_cfg, outputs)
-
-     benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
-     data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
-
-     # update task_cfg.dataset_args
-     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
-
-     return Evaluator(
-         dataset_name_or_path=benchmark.dataset_id,
-         data_adapter=data_adapter,
-         model_adapter=model_adapter,
-         outputs=outputs,
-         task_cfg=task_cfg,
-     )
-
-
  def main():
      from evalscope.arguments import parse_args
      args = parse_args()
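
Note on the run.py change above: the 0.10.x flow (get_local_model, Benchmark.get, initialize_model_adapter, create_evaluator) is replaced by the new evalscope.api components. The following is a minimal sketch of the 1.x native-backend flow, using only the calls that appear in this diff; the helper name run_native_eval is hypothetical, and it assumes you already have a parsed TaskConfig and an OutputsStructure.

from typing import List

from evalscope.api.evaluator import Evaluator
from evalscope.api.model import get_model_with_task_config
from evalscope.api.registry import get_benchmark
from evalscope.config import TaskConfig
from evalscope.evaluator import DefaultEvaluator
from evalscope.utils.io_utils import OutputsStructure


def run_native_eval(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
    """Sketch of the 1.x native-backend flow shown in the run.py diff above."""
    # One model instance is shared across all benchmarks.
    model = get_model_with_task_config(task_config=task_config)

    # One DefaultEvaluator per dataset, resolved through the benchmark registry.
    evaluators: List[Evaluator] = [
        DefaultEvaluator(
            task_config=task_config,
            model=model,
            benchmark=get_benchmark(name, task_config),
            outputs=outputs,
        )
        for name in task_config.datasets
    ]

    # eval() no longer takes infer_cfg/debug/limit; the evaluator presumably reads
    # those from the TaskConfig it was constructed with.
    return {e.benchmark.name: e.eval() for e in evaluators}

The real evaluate_model additionally syncs task_config.dataset_args with each benchmark's metadata, prints an overall report table via gen_table, and frees model/GPU memory afterwards, as shown in the diff.
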
evalscope/summarizer.py CHANGED
@@ -7,8 +7,7 @@ from typing import List, Union
  from evalscope.config import TaskConfig, parse_task_config
  from evalscope.constants import EvalBackend
  from evalscope.report import gen_table
- from evalscope.utils import csv_to_list, get_latest_folder_path
- from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
+ from evalscope.utils.io_utils import OutputsStructure, csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -30,7 +29,7 @@ class Summarizer:
              with open(report_file, 'r') as f:
                  res_list.append(json.load(f))

-         report_table: str = gen_table([reports_dir])
+         report_table: str = gen_table(reports_path_list=[reports_dir])
          logger.info(f'*** Report table ***\n{report_table}')

          return res_list
@@ -81,7 +80,7 @@

              summary_file_path = summary_files[0]
              # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'} # noqa: E501
-             summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
+             summary_res: List[dict] = csv_to_list(summary_file_path)
              final_res_list.extend(summary_res)
          elif eval_backend == EvalBackend.VLM_EVAL_KIT:
              eval_config = Summarizer.parse_eval_config(candidate_task)
@@ -105,7 +104,7 @@
                  summary_res: dict = csv_to_list(summary_file_path)[0]
              elif summary_file_path.endswith('json'):
                  summary_res: dict = json_to_dict(summary_file_path)
-             file_name = os.path.basename(summary_file_path).split('.')[0]
+             base_name = os.path.basename(summary_file_path)
+             file_name = os.path.splitext(base_name)[0]
              final_res_list.append({file_name: summary_res})

          elif eval_backend == EvalBackend.THIRD_PARTY:
evalscope/third_party/longbench_write/infer.py CHANGED
@@ -8,7 +8,7 @@ import random
  import torch
  from typing import List

- from evalscope.models.api import OpenaiApi
+ from evalscope.third_party.longbench_write.tools.openai_api import OpenaiApi
  from evalscope.third_party.longbench_write.utils import count_words
  from evalscope.utils import get_logger

evalscope/third_party/thinkbench/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.third_party.thinkbench.eval import run_task
@@ -0,0 +1,441 @@
1
+ import json
2
+ import os
3
+ import pandas as pd
4
+ import plotly.graph_objects as go
5
+ import re
6
+ from collections import defaultdict
7
+ from functools import lru_cache
8
+ from modelscope import AutoTokenizer
9
+ from plotly.subplots import make_subplots
10
+ from tqdm.contrib.concurrent import thread_map
11
+ from typing import List
12
+
13
+ from evalscope.third_party.thinkbench.tools.llm import request_url
14
+ from evalscope.third_party.thinkbench.tools.utils import extract_answer
15
+ from evalscope.utils.io_utils import dict_to_json, dump_jsonl_data, json_to_dict, jsonl_to_list
16
+
17
+ cur_path = os.path.dirname(os.path.abspath(__file__))
18
+
19
+ class EvalThink:
20
+ def __init__(self, report_path, tokenizer_path, model_name, dataset_name, subsets, split_strategies='llm', judge_config=None):
21
+ self.report_path = report_path
22
+ self.reformat_template = open(os.path.join(cur_path, 'resources/reformat_template.txt'), 'r').read()
23
+ self.critique_template = open(os.path.join(cur_path, 'resources/critique_template.txt'), 'r').read()
24
+ self.switch_tokens = ['alternatively', 'but wait', 'let me reconsider', 'another way', 'another approach', 'another method', 'another angle']
25
+ self.subset_dict = defaultdict(lambda: defaultdict(list))
26
+ self.think_end_token = '</think>'
27
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
28
+ self.model_name = model_name
29
+ self.dataset_name = dataset_name
30
+ self.subsets = subsets
31
+ self.metrics = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens','token_efficiency', 'thought_num', 'accuracy']
32
+ self.split_strategies = split_strategies # split by llm, keywords, separator
33
+ self.judge_config = judge_config
34
+ self.model_parse_file_path = os.path.join(self.report_path, 'answer_index.jsonl')
35
+ self.model_parse_dict = self.__init_parse_file()
36
+
37
+ def __init_parse_file(self):
38
+ if not os.path.exists(self.model_parse_file_path):
39
+ return {}
40
+ else:
41
+ list_file = jsonl_to_list(self.model_parse_file_path)
42
+ # convert to dict prompt as key, answer_index as value
43
+ return {item['prompt']: item['answer_index'] for item in list_file}
44
+
45
+ def get_think_part(self, message: dict) -> str:
46
+ if 'reasoning_content' in message and message['reasoning_content']:
47
+ return message['reasoning_content']
48
+ else:
49
+ text = message['content']
50
+ last_think_end = text.rfind(self.think_end_token)
51
+ return text[:last_think_end]
52
+
53
+ @lru_cache(maxsize=None)
54
+ def cal_tokens(self, text: str):
55
+ return len(self.tokenizer.encode(text, add_special_tokens=False))
56
+
57
+ def process_choice(self, choice, problem):
58
+ think_part = self.get_think_part(choice['message'])
59
+ answer = choice['review']['gold']
60
+ tokens = self.cal_tokens(think_part)
61
+ switch_count = sum(think_part.lower().count(token) for token in self.switch_tokens)
62
+ useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
63
+ reflection_tokens = tokens - useful_tokens
64
+ # score = choice['review']['result']
65
+ score = 0 if useful_tokens == 0 else 1
66
+ return tokens, switch_count, useful_tokens, reflection_tokens, score
67
+
68
+ def process_item(self, item):
69
+ problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
70
+ results = []
71
+ for choice in item['choices']:
72
+ results.append(self.process_choice(choice, problem))
73
+ break # only process the first choice
74
+
75
+ total_tokens, switch_counts, useful_tokens, reflection_tokens, scores = zip(*results)
76
+
77
+ avg_tokens = sum(total_tokens) / len(total_tokens)
78
+ avg_thought_num = sum(switch_counts) / len(switch_counts)
79
+ avg_token_efficiency = sum(useful_tokens) / sum(total_tokens)
80
+ avg_accuracy = sum(scores) / len(scores)
81
+ avg_useful_tokens = sum(useful_tokens) / len(useful_tokens)
82
+ avg_reflection_tokens = sum(reflection_tokens) / len(reflection_tokens)
83
+ return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens
84
+
85
+ def split_by_llm(self, response, problem) -> List[str]:
86
+ response = response.replace('\n', ' ') # remove newline characters
87
+ prompt = self.reformat_template.format(problem=problem, response=response)
88
+ llm_response = request_url(self.judge_config, prompt)
89
+ return llm_response.split('\n\n')
90
+
91
+ def split_by_keywords(self, text) -> List[str]:
92
+ pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, self.switch_tokens)))
93
+ segments = re.split(pattern, text)
94
+ # remove empty segments
95
+ segments = [segment.strip() for segment in segments if segment.strip()]
96
+
97
+ return segments if segments else [text]
98
+
99
+ def split_by_separator(self, text) -> List[str]:
100
+ return text.split('\n\n')
101
+
102
+ def get_answer_index(self, response: List[str], problem: str, answer: str) -> int:
103
+ tagged_response = ''
104
+ for sdx, step in enumerate(response):
105
+ tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
106
+ tagged_response = tagged_response.strip()
107
+
108
+ prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
109
+ if prompt in self.model_parse_dict:
110
+ answer_index = self.model_parse_dict[prompt]
111
+ else:
112
+ llm_response = request_url(self.judge_config, prompt)
113
+ if not llm_response:
114
+ answer_index = -1
115
+ else:
116
+ answer_index = extract_answer(llm_response)
117
+
118
+ dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
119
+ self.model_parse_file_path, dump_mode='append')
120
+ try:
121
+ answer_index = int(answer_index)
122
+ except Exception:
123
+ answer_index = -1
124
+ return answer_index
125
+
126
+ def get_first_correct(self, response: str, problem: str, answer: str) -> str:
127
+ if self.split_strategies == 'llm':
128
+ text_list = self.split_by_llm(response, problem)
129
+ elif self.split_strategies == 'keywords':
130
+ text_list = self.split_by_keywords(response)
131
+ else:
132
+ text_list = self.split_by_separator(response)
133
+
134
+ answer_index = self.get_answer_index(text_list, problem, answer)
135
+
136
+ if answer_index == -1: # no correct answer found
137
+ first_correct = ''
138
+ else:
139
+ first_correct = '\n\n'.join(text_list[: answer_index])
140
+ return first_correct
141
+
+     def plot_metrics(self, results, output_dir):
+         # Six metrics laid out in a 2x3 grid of subplots
+         fig = make_subplots(rows=2, cols=3,
+                             subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
+                                             'Token Efficiency', 'Thought Num', 'Accuracy'),
+                             shared_xaxes=True, x_title='Subsets',
+                             vertical_spacing=0.1,  # Decrease vertical spacing between subplots
+                             horizontal_spacing=0.1)  # Decrease horizontal spacing between subplots
+
+         metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
+                          'token_efficiency', 'thought_num', 'accuracy']
+
+         for i, metric in enumerate(metrics_order, start=1):
+             y_values = [results[metric][subset] for subset in self.subsets]
+             # Determine row and column for 2x3 layout
+             row = (i - 1) // 3 + 1
+             col = (i - 1) % 3 + 1
+             fig.add_trace(
+                 go.Scatter(x=list(range(len(self.subsets))), y=y_values,
+                            mode='lines+markers',
+                            name=metric.replace('_', ' ').title()),
+                 row=row, col=col
+             )
+             # Add annotations for each data point
+             for j, y in enumerate(y_values):
+                 fig.add_annotation(
+                     x=j,
+                     y=y,
+                     text=f'{y:.2f}',
+                     showarrow=False,
+                     yshift=10,
+                     row=row,
+                     col=col
+                 )
+
+         fig.update_layout(
+             height=800,  # Adjust height for 2x3 layout
+             width=1200,  # Adjust width for 2x3 layout
+             title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
+             legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
+         )
+
+         for i in range(1, len(metrics_order) + 1):
+             row = (i - 1) // 3 + 1
+             col = (i - 1) % 3 + 1
+             fig.update_xaxes(
+                 ticktext=self.subsets,
+                 tickvals=list(range(len(self.subsets))),
+                 row=row, col=col
+             )
+             fig.update_yaxes(title_text=metrics_order[i - 1].replace('_', ' ').title(), row=row, col=col)
+
+         # Update y-axis ranges
+         fig.update_yaxes(range=[500, 5000], row=1, col=1)  # Reasoning Tokens
+         fig.update_yaxes(range=[0, 3000], row=1, col=2)  # First Correct Tokens
+         fig.update_yaxes(range=[0, 3000], row=1, col=3)  # Reflection Tokens
+         fig.update_yaxes(range=[0, 1], row=2, col=1)  # Token Efficiency
+         fig.update_yaxes(range=[0, 13], row=2, col=2)  # Thought Num
+         fig.update_yaxes(range=[0, 1], row=2, col=3)  # Accuracy
+
+         os.makedirs(output_dir, exist_ok=True)
+         output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
+         fig.write_image(output_path)
+         print(f'Saved figure to: {output_path}')
+
+     def filter_df(self, df, response_len: int = 8000, count: int = 10):
+         # Keep only rows whose every choice stays within the token budget, then take the first `count` rows
+         def is_valid_row(row):
+             return all(self.cal_tokens(choice['message']['content']) <= response_len for choice in row['choices'])
+
+         bools = df.apply(is_valid_row, axis=1)
+
+         return df[bools].head(count)
+
+     def evaluate(self, output_dir, max_tokens=8000, count=50, workers=128):
+         for subset in self.subsets:
+             review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
+             review_df = pd.read_json(review_path, lines=True)
+
+             review_df = self.filter_df(review_df, response_len=max_tokens, count=count)
+
+             results = thread_map(
+                 self.process_item,
+                 (item for _, item in review_df.iterrows()),
+                 desc=f'Evaluating {subset}',
+                 total=len(review_df),
+                 max_workers=workers
+             )
+
+             avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens = zip(*results)
+
+             self.subset_dict[subset]['reasoning_tokens'] = sum(avg_tokens) / len(avg_tokens)
+             self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
+             self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
+             self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
+             self.subset_dict[subset]['first_correct_tokens'] = sum(avg_useful_tokens) / len(avg_useful_tokens)
+             self.subset_dict[subset]['reflection_tokens'] = sum(avg_reflection_tokens) / len(avg_reflection_tokens)
+
+         # Reshape into {metric: {subset: value}} for plotting and saving
+         results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
+                    for metric in self.metrics}
+
+         self.plot_metrics(results, output_dir)
+
+         # save results to json
+         dict_to_json(results, os.path.join(self.report_path, 'think_eval_results.json'))
+         return results
+
+ def run_task(config, output_dir='outputs', max_tokens=8000, count=50, workers=128):
+     evaluator = EvalThink(**config)
+     results = evaluator.evaluate(output_dir, max_tokens, count, workers)
+     print(results)
+
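+ # run_task expects an existing evalscope report directory: per evaluate() above, it reads
+ #   <report_path>/reviews/<model_name>/<dataset_name>_<subset>.jsonl
+ # and writes <report_path>/think_eval_results.json plus a metrics figure under output_dir.
+ # The example configs below show the fields EvalThink is constructed with.
+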
+ def combine_results(configs: List[dict], output_path: str):
+     """
+     Combine evaluation results from multiple model configs into one plot.
+     All models' results for the same metric are shown in the same subplot for easy comparison.
+
+     Args:
+         configs: List of model config dicts containing model_name and report_path.
+         output_path: Path of the comparison figure to write.
+     """
+     # Combine results from different runs
+     combined_results = {}
+     for config in configs:
+         model_name = config['model_name']
+         report_path = config['report_path']
+         # Each results file maps metric -> {subset: value}
+         results = json_to_dict(os.path.join(report_path, 'think_eval_results.json'))
+         combined_results[model_name] = results
+
+     # Create a 2x3 subplot layout, one subplot per metric
+     fig = make_subplots(rows=2, cols=3,
+                         subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
+                                         'Token Efficiency', 'Thought Num', 'Accuracy'),
+                         shared_xaxes=True, x_title='Subsets',
+                         vertical_spacing=0.08,  # Reduce vertical spacing between subplots
+                         horizontal_spacing=0.05)  # Reduce horizontal spacing between subplots
+
+     metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
+                      'token_efficiency', 'thought_num', 'accuracy']
+
+     # Assign different colors for each model
+     colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
+
+     # Plot each metric in a separate subplot
+     for i, metric in enumerate(metrics_order, start=1):
+         row = (i - 1) // 3 + 1
+         col = (i - 1) % 3 + 1
+
+         # Get subsets from first model (assuming all models have same subsets)
+         subsets = list(next(iter(combined_results.values()))[metric].keys())
+
+         # Add all models' data for this metric to the same subplot
+         for j, (model_name, results) in enumerate(combined_results.items()):
+             y_values = [results[metric][subset] for subset in subsets]
+
+             fig.add_trace(
+                 go.Scatter(x=subsets, y=y_values,
+                            mode='lines+markers',
+                            name=model_name,  # Just the model name; metrics are shown in subplot titles
+                            line=dict(color=colors[j % len(colors)]),
+                            showlegend=(i == 1)),  # Only show legend for first metric
+                 row=row, col=col
+             )
+
+             # Add value annotations
+             for k, y in enumerate(y_values):
+                 fig.add_annotation(
+                     x=subsets[k],
+                     y=y,
+                     text=f'{y:.2f}',
+                     showarrow=False,
+                     yshift=10,
+                     font=dict(size=12, color=colors[j % len(colors)]),
+                     row=row, col=col
+                 )
+
+         # Update axis ranges and labels based on metric type
+         # if metric == 'token_efficiency':
+         #     fig.update_yaxes(range=[0.2, 0.7], row=row, col=col)
+         # elif metric == 'accuracy':
+         #     fig.update_yaxes(range=[0.8, 1], row=row, col=col)
+
+         fig.update_yaxes(title_text=metric.replace('_', ' ').title(), row=row, col=col)
+
+     # Update layout
+     fig.update_layout(
+         height=1000,  # Increase height
+         width=1500,  # Increase width
+         title_text='Model Comparison Across Evaluation Metrics on MATH-500',
+         title=dict(font=dict(size=22)),  # Larger title font
+         font=dict(size=14),  # Larger base font size
+         legend=dict(
+             orientation='h',
+             yanchor='bottom',
+             y=1.02,
+             xanchor='right',
+             x=1,
+             font=dict(size=14)  # Larger legend font size
+         )
+     )
+
+     # Save plot
+     os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
+     fig.write_image(output_path)
+     print(f'Model comparison plot saved to {output_path}')
+
+     return combined_results
+
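+ # Shape of the think_eval_results.json file consumed above (values are illustrative only):
+ #   {"reasoning_tokens": {"Level 1": 1234.5, ...},
+ #    "token_efficiency": {"Level 1": 0.45, ...},
+ #    ...}
+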
+ # Judge model endpoint used by split_by_llm and get_answer_index
+ judge_config = dict(
+     api_key='EMPTY',
+     base_url='http://0.0.0.0:8801/v1',
+     model_name='Qwen2.5-72B-Instruct',
+ )
+
+ distill_qwen_config = dict(
+     report_path='../eval-scope/outputs/20250218_180219',
+     model_name='DeepSeek-R1-Distill-Qwen-7B',
+     tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
+     dataset_name='math_500',
+     subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator',
+     judge_config=judge_config
+ )
+
+ math_qwen_config = dict(
+     report_path='../eval-scope/outputs/20250219_202358',
+     model_name='Qwen2.5-Math-7B-Instruct',
+     tokenizer_path='Qwen/Qwen2.5-Math-7B-Instruct',
+     dataset_name='math_500',
+     subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator',
+     judge_config=judge_config
+ )
+
+ r1_config = dict(
+     report_path='../eval-scope/outputs/20250307_000404',
+     model_name='deepseek-r1',
+     tokenizer_path='deepseek-ai/DeepSeek-R1',
+     dataset_name='math_500',
+     subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator',
+     judge_config=judge_config
+ )
+
+ qwq_preview_config = dict(
+     report_path='../eval-scope/outputs/20250221_105911',
+     model_name='qwq-32b-preview',
+     tokenizer_path='Qwen/QwQ-32B-Preview',
+     dataset_name='math_500',
+     subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator',
+     judge_config=judge_config
+ )
+
+ qwq_config = dict(
+     report_path='../eval-scope/outputs/20250306_181550',
+     model_name='QwQ-32B',
+     tokenizer_path='Qwen/QwQ-32B',
+     dataset_name='math_500',
+     subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator',
+     judge_config=judge_config
+ )
+
+ distill_qwen_32b = dict(
+     report_path='../eval-scope/outputs/20250306_235951',
+     model_name='deepseek-r1-distill-qwen-32b',
+     tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
+     dataset_name='math_500',
+     subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator',
+     judge_config=judge_config
+ )
+
+ qwen3_32b_think = dict(
+     report_path='../eval-scope/outputs/20250428_151817',
+     model_name='Qwen3-32B',
+     tokenizer_path='Qwen/Qwen3-32B',
+     dataset_name='math_500',
+     subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator',
+     judge_config=judge_config
+ )
+
+ if __name__ == '__main__':
+     # run_task(distill_qwen_config, count=80)
+     # run_task(math_qwen_config)
+     # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
+     # run_task(r1_config, max_tokens=20000, count=200, workers=128)
+     # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
+     run_task(qwen3_32b_think, max_tokens=20000, count=200, workers=128)
+     # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)
+
+     # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
+     # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
+     # combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
+     combine_results([qwq_config, r1_config, distill_qwen_32b, qwen3_32b_think], output_path='outputs/model_comparison_metrics_4models.png')