evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/evaluator/evaluator.py
@@ -1,377 +1,393 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ """
3
+ Default evaluator implementation for running benchmark evaluations.
4
+
5
+ This module provides the DefaultEvaluator class which orchestrates the entire
6
+ evaluation process including data loading, model inference, metric calculation,
7
+ and report generation.
8
+ """
2
9
 
3
- import json
4
10
  import os
5
- import time
6
- from collections import OrderedDict
7
- from copy import deepcopy
11
+ import traceback
12
+ from collections import defaultdict
8
13
  from tqdm import tqdm
9
- from typing import Any, Dict, List, Optional, Union
14
+ from typing import TYPE_CHECKING, Callable, Dict, List
10
15
 
11
- from evalscope.benchmarks import DataAdapter
12
- from evalscope.config import TaskConfig
13
- from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
14
- from evalscope.models import BaseModelAdapter, CustomModelAdapter
16
+ from evalscope.api.dataset import Dataset, DatasetDict, Sample
17
+ from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
18
+ from evalscope.api.metric import AggScore, SampleScore
19
+ from evalscope.constants import HEARTBEAT_INTERVAL_SEC
15
20
  from evalscope.report import Report, gen_table
16
- from evalscope.utils import dict_torch_dtype_to_str, gen_hash
17
- from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
21
+ from evalscope.utils.function_utils import run_in_threads_with_progress
18
22
  from evalscope.utils.logger import get_logger
19
23
 
24
+ if TYPE_CHECKING:
25
+ from evalscope.api.benchmark import DataAdapter
26
+ from evalscope.api.model import Model
27
+ from evalscope.config import TaskConfig
28
+ from evalscope.utils.io_utils import OutputsStructure
29
+
20
30
  logger = get_logger()
21
31
 
22
32
 
23
- class Evaluator(object):
33
+ class DefaultEvaluator(Evaluator):
24
34
  """
25
- The evaluator for model on datasets.
35
+ Default Evaluator for running evaluations on benchmarks.
36
+
37
+ This evaluator handles the complete evaluation pipeline:
38
+ 1. Loading datasets from benchmarks
39
+ 2. Running model inference on samples
40
+ 3. Calculating evaluation metrics
41
+ 4. Generating and saving reports
42
+ 5. Managing caching for predictions and reviews
26
43
 
27
44
  Args:
28
- dataset_name_or_path: str, the dataset name or path.
29
- if the dataset is a local path, e.g. /path/to/your_dataset_name,
30
- then the task name will be the basename of the path, which is `your_dataset_name`.
31
- data_adapter: DataAdapter, the data adapter for the dataset.
32
- model_adapter: BaseModelAdapter, the model adapter for the model.
33
- outputs: OutputsStructure, the outputs dir. Default: None
34
- task_cfg: TaskConfig, the overall task config. Default: None
35
- **kwargs: kwargs.
45
+ benchmark: The data adapter for loading and processing data.
46
+ model: The model to be evaluated.
47
+ outputs: The output structure for saving evaluation results.
48
+ task_config: The task configuration.
36
49
  """
37
50
 
38
- def __init__(self,
39
- dataset_name_or_path: str,
40
- data_adapter: DataAdapter,
41
- model_adapter: BaseModelAdapter,
42
- outputs: OutputsStructure = None,
43
- task_cfg: TaskConfig = None,
44
- **kwargs):
45
-
46
- self.dataset_name = data_adapter.name
47
- self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
48
- self.model_name = task_cfg.model_id
49
- self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
50
-
51
- self.data_adapter = data_adapter
52
- self.model_adapter = model_adapter
53
- self.model_cfg = model_adapter.model_cfg
54
- self.eval_type = task_cfg.eval_type
55
- self.dataset_hub = task_cfg.dataset_hub
56
- self.stage = task_cfg.stage
57
- self.use_cache = task_cfg.use_cache
58
- self.task_cfg = task_cfg
59
- # Deal with the output paths
60
- self.outputs_structure = outputs
61
-
62
- self.kwargs = kwargs
63
-
64
- def load_dataset(self):
65
- dataset = self.data_adapter.load(
66
- dataset_name_or_path=self.dataset_name_or_path,
67
- subset_list=self.data_adapter.subset_list,
68
- work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
69
- datasets_hub=self.dataset_hub,
70
- **self.kwargs)
71
-
72
- # Get prompts from dataset
73
- prompts = self.data_adapter.gen_prompts(data_dict=dataset)
74
- return prompts
75
-
76
- def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
77
- model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
78
- input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
79
- infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
80
- return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
81
-
82
- def _process_answer(self, answer_d, input_d, subset_name, answer_id):
83
- answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
84
- answer_d[AnswerKeys.ANSWER_ID] = answer_id
85
- answer_d[AnswerKeys.SUBSET_NAME] = subset_name
86
- answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
87
- answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
88
- return answer_d
89
-
90
- def get_answers(self,
91
- subset_name: str,
92
- prompts_list: List[dict],
93
- infer_cfg: dict = None,
94
- debug: bool = False,
95
- **kwargs) -> list:
96
- """
97
- Get answers from model inference.
98
- It is required to rewrite this method to support your own evaluator.
51
+ def __init__(
52
+ self,
53
+ benchmark: 'DataAdapter',
54
+ model: 'Model',
55
+ outputs: 'OutputsStructure',
56
+ task_config: 'TaskConfig',
57
+ ):
58
+ # Store core components needed for evaluation
59
+ self.benchmark = benchmark
60
+ self.model = model
61
+ self.outputs = outputs
62
+ self.task_config = task_config
63
+
64
+ # Extract frequently used identifiers
65
+ self.benchmark_name = benchmark.name
66
+ """Name of the benchmark being evaluated."""
67
+
68
+ self.model_name = task_config.model_id
69
+ """ID of the model being evaluated."""
70
+
71
+ self.use_cache = task_config.use_cache
72
+ """Whether to use cache for predictions."""
73
+
74
+ # Initialize cache manager for storing and retrieving cached results
75
+ self.cache_manager = CacheManager(
76
+ outputs=outputs,
77
+ model_name=self.model_name,
78
+ benchmark_name=self.benchmark_name,
79
+ )
99
80
 
100
- Args:
101
- subset_name: subset name for benchmark.
102
- prompts_list: prompts list.
103
- infer_cfg: model inference config.
104
- Attributes:
105
- do_sample: bool, whether to use sampling.
106
- top_k: int, the number of highest probability vocabulary tokens to keep for top-k-filtering.
107
- top_p: float, if set to float < 1, only the most probable tokens with probabilities to add.
108
- temperature: float, the value used to module the next token probabilities.
109
- num_beams: int, number of beams for beam search. 1 means no beam search.
110
- max_length: int, the max length of the sequence to be generated.
111
- max_new_tokens: int, the max number of new tokens to be generated.
112
- repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
113
- debug: whether to run in debug mode.
114
- **kwargs: kwargs.
115
-
116
- Returns: The list of answers.
81
+ def eval(self) -> Report:
117
82
  """
118
- assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
119
- assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
120
- assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
83
+ Run the complete evaluation process.
121
84
 
122
- answers_list = []
123
- pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
124
- pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
125
- os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
85
+ This is the main entry point that orchestrates the entire evaluation:
86
+ 1. Load dataset from benchmark
87
+ 2. Evaluate each subset independently
88
+ 3. Aggregate scores across subsets
89
+ 4. Generate final evaluation report
126
90
 
127
- if self.use_cache and os.path.exists(pred_file_path):
128
- answers_list = jsonl_to_list(pred_file_path)
129
- logger.info(f'Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
130
- # Note: assume prediction in order of prompts_list
131
- prompts_list = prompts_list[len(answers_list):]
132
-
133
- if isinstance(self.model_adapter, CustomModelAdapter):
134
- # Batch inference for custom model
91
+ Returns:
92
+ Report: The complete evaluation report containing all metrics and results.
93
+ """
94
+ # Load the dataset and evaluate each subset
95
+ logger.info(f'Start evaluating benchmark: {self.benchmark_name}')
96
+ dataset_dict = self.benchmark.load_dataset()
97
+ agg_score_dict = defaultdict(list)
98
+
99
+ # Process each subset (e.g., test, validation) independently
100
+ logger.info('Evaluating all subsets of the dataset...')
101
+ for subset, dataset in dataset_dict.items():
102
+ if len(dataset) == 0:
103
+ logger.info(f'No samples found in subset: {subset}, skipping.')
104
+ continue
105
+ logger.info(f'Evaluating subset: {subset}')
106
+ subset_score = self.evaluate_subset(subset, dataset)
107
+ agg_score_dict[subset] = subset_score
135
108
 
136
- resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
137
- inputs=prompts_list, infer_cfg=infer_cfg)
109
+ # Generate the report based on aggregated scores
110
+ logger.info('Generating report...')
111
+ report = self.get_report(agg_score_dict)
138
112
 
139
- for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
140
- answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
141
- processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
142
- answers_list.append(processed_answer)
143
- dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
113
+ # Finalize the evaluation process
114
+ self.finalize()
115
+ logger.info(f'Benchmark {self.benchmark_name} evaluation finished.')
116
+ return report
144
117
 
145
- else:
146
- for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
147
- answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
148
- answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
149
- processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
150
-
151
- if debug:
152
- logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
153
- logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')
154
-
155
- answers_list.append(processed_answer)
156
- dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
157
-
158
- logger.info(f'Dump predictions to {pred_file_path}.')
159
- return answers_list
160
-
161
- def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:
162
-
163
- if reviewer_spec is None:
164
- reviewer_spec = {}
165
-
166
- review_res = deepcopy(answer_d)
167
- choices = review_res[AnswerKeys.CHOICES]
168
- if len(choices) == 0:
169
- review_res[ReviewKeys.REVIEWED] = False
170
- review_res[ReviewKeys.REVIEW_ID] = None
171
- review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
172
- review_res[ReviewKeys.REVIEW_TIME] = time.time()
173
- return review_res
174
-
175
- rev_choices = []
176
- for choice in choices:
177
- raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
178
- answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
179
- answer_content = self.data_adapter.parse_pred_result(
180
- result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
181
- gold_content = self.data_adapter.get_gold_answer(raw_input_d)
182
-
183
- review_result = self.data_adapter.match(gold_content, answer_content)
184
- choice[ReviewKeys.REVIEW] = {
185
- ReviewKeys.GOLD: gold_content,
186
- ReviewKeys.PRED: answer_content,
187
- ReviewKeys.RESULT: review_result
188
- }
189
-
190
- rev_choices.append(choice)
191
-
192
- review_res[AnswerKeys.CHOICES] = rev_choices
193
- review_res[ReviewKeys.REVIEWED] = True
194
- review_res[ReviewKeys.REVIEW_ID] = review_id
195
- review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
196
- review_res[ReviewKeys.REVIEW_TIME] = time.time()
197
-
198
- return review_res
199
-
200
- def _generate_review_id(self, answer_d):
201
- # Gen review_id (concat: answer_id + reviewer_spec)
202
- answer_id = answer_d[AnswerKeys.ANSWER_ID]
203
- reviewer_spec = {
204
- 'metric': [metric.name for metric in self.data_adapter.metric_list],
205
- 'reviewer': ['Evaluator'],
206
- 'revision': ['default']
207
- }
208
- reviewer_spec_str = json.dumps(
209
- OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
210
- review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
211
- return review_id, reviewer_spec
212
-
213
- def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
118
+ def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
214
119
  """
215
- Get reviews from answers.
216
- It is required to rewrite this method to support your own evaluator.
120
+ Evaluate a single subset of the dataset.
121
+
122
+ This method processes one subset through the complete evaluation pipeline:
123
+ 1. Get model predictions for all samples
124
+ 2. Calculate evaluation metrics for predictions
125
+ 3. Aggregate individual sample scores
217
126
 
218
127
  Args:
219
- subset_name: subset name of benchmark
220
- answers_list: inference results list.
221
- debug: whether to run in debug mode.
222
- **kwargs: kwargs.
128
+ subset: Name of the subset being evaluated (e.g., 'test', 'validation').
129
+ dataset: The dataset subset containing samples to evaluate.
223
130
 
224
- Returns: reviews list.
131
+ Returns:
132
+ List[AggScore]: Aggregated scores for this subset.
225
133
  """
226
- reviews_list = []
227
-
228
- review_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
229
- review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
230
- os.makedirs(os.path.dirname(review_file_path), exist_ok=True)
134
+ # Get model predictions for all samples in the subset
135
+ logger.info(f'Getting predictions for subset: {subset}')
136
+ task_states = self.get_answers(subset, dataset)
231
137
 
232
- if self.use_cache and os.path.exists(review_file_path):
233
- logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
138
+ # Calculate evaluation metrics for each prediction
139
+ logger.info(f'Getting reviews for subset: {subset}')
140
+ sample_scores = self.get_reviews(subset, task_states)
234
141
 
235
- for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
236
- review_id, reviewer_spec = self._generate_review_id(answer_d)
237
- # Get review
238
- review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
142
+ # Aggregate individual sample scores into subset-level metrics
143
+ logger.info(f'Aggregating scores for subset: {subset}')
144
+ agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
145
+ return agg_scores
239
146
 
240
- if debug:
241
- logger.info(review_d)
147
+ def get_answers(self, subset: str, dataset: Dataset) -> List[TaskState]:
148
+ """
149
+ Get model predictions for all samples in the dataset subset.
242
150
 
243
- reviews_list.append(review_d)
244
- # Dump reviews
245
- dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
151
+ This method handles:
152
+ 1. Loading cached predictions if available and caching is enabled
153
+ 2. Running model inference on remaining samples in parallel
154
+ 3. Saving new predictions to cache
246
155
 
247
- return reviews_list
156
+ Args:
157
+ subset: Name of the subset being processed.
158
+ dataset: The dataset subset containing samples for prediction.
248
159
 
249
- def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
160
+ Returns:
161
+ List[TaskState]: Task states containing model predictions for each sample.
162
+ """
163
+ # Initialize task state list and filter cached predictions if caching is enabled
164
+ if self.use_cache:
165
+ cached_task_state_list, dataset = self.cache_manager.filter_prediction_cache(subset, dataset)
166
+ else:
167
+ cached_task_state_list = []
168
+
169
+ # Get output directory for storing model predictions
170
+ model_prediction_dir = os.path.dirname(self.cache_manager.get_prediction_cache_path(subset))
171
+
172
+ # Convert dataset to list for parallel processing
173
+ dataset_list = list(dataset)
174
+ if not dataset_list:
175
+ return cached_task_state_list
176
+
177
+ logger.info(f'Processing {len(dataset_list)} samples, if data is large, it may take a while.')
178
+
179
+ def worker(sample: Sample) -> TaskState:
180
+ return self._predict_sample(sample, model_prediction_dir)
181
+
182
+ def on_result(sample: Sample, task_state: TaskState) -> None:
183
+ model_result = self.cache_manager.save_prediction_cache(subset, task_state, self.benchmark.save_metadata)
184
+ logger.debug(f'Model result: \n{model_result.pretty_print()}')
185
+
186
+ def on_error(sample: Sample, exc: Exception) -> None:
187
+ tb_str = traceback.format_exc()
188
+ logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}')
+            if self.task_config.ignore_errors:
+                logger.warning('Error ignored, continuing with next sample.')
+                return
+            raise exc
+
+        finished_task_states = run_in_threads_with_progress(
+            dataset_list,
+            worker,
+            desc=f'Predicting[{self.benchmark_name}@{subset}]: ',
+            max_workers=self.task_config.eval_batch_size,
+            heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+            on_result=on_result,
+            on_error=on_error,
+            filter_none_results=True,
+        )
+
+        logger.info(f'Finished getting predictions for subset: {subset}.')
+        return cached_task_state_list + finished_task_states
+
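get_answers fans the remaining samples out to run_in_threads_with_progress with a per-sample worker plus on_result and on_error callbacks. The helper itself is not shown in this diff; below is a minimal sketch of how such a utility could be built on concurrent.futures, mirroring the parameters used above (worker, desc, max_workers, on_result, on_error, filter_none_results) while omitting the heartbeat logging implied by heartbeat_sec. It keeps results aligned with the input order and leaves None placeholders for failed items when filter_none_results is False, which matches how the review path later filters by index. This is an assumption about the utility's shape, not the package's actual implementation.

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, List, Optional, TypeVar

from tqdm import tqdm

T = TypeVar('T')
R = TypeVar('R')


def run_in_threads_with_progress_sketch(
    items: List[T],
    worker: Callable[[T], R],
    desc: str = '',
    max_workers: int = 4,
    on_result: Optional[Callable[[T, R], None]] = None,
    on_error: Optional[Callable[[T, Exception], None]] = None,
    filter_none_results: bool = True,
) -> List[Optional[R]]:
    """Run `worker` over `items` in a thread pool with a progress bar (illustrative sketch)."""
    results: List[Optional[R]] = [None] * len(items)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {executor.submit(worker, item): i for i, item in enumerate(items)}
        for future in tqdm(as_completed(future_to_index), total=len(items), desc=desc):
            index = future_to_index[future]
            item = items[index]
            try:
                result = future.result()
            except Exception as exc:
                if on_error is not None:
                    on_error(item, exc)  # may re-raise; otherwise the slot stays None
                continue
            if on_result is not None:
                on_result(item, result)
            results[index] = result
    if filter_none_results:
        return [result for result in results if result is not None]
    return results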
+    def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
         """
-        To compute metrics from reviews_list for each subset.
-        It is required to rewrite this method to support your own evaluator.
+        Helper method to predict a single sample.

         Args:
-            reviews_list: reviews list.
+            sample: The sample to predict.
+            model_prediction_dir: Directory for storing model predictions.

         Returns:
-            The metric result. Depends on the metric function in data_adapter.
+            TaskState: The task state containing the prediction result.
         """
+        logger.debug(f'\n{sample.pretty_print()}')

-        review_res_list = []
-        for review_d in reviews_list:
-            if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
-                continue
+        # Run model inference on the current sample
+        task_state = self.benchmark.run_inference(model=self.model, sample=sample, output_dir=model_prediction_dir)
+        return task_state

-            if len(review_d[AnswerKeys.CHOICES]) == 0:
-                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
-                continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1:
-                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
-            else:
-                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
-
-            review_res_list.append(review_res)
+    def get_reviews(self, subset: str, task_states: List[TaskState]) -> List[SampleScore]:
+        """
+        Calculate evaluation metrics for model predictions.

-        metric_score: List[dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
+        This method handles:
+        1. Loading cached review results if available and caching is enabled
+        2. Computing metrics for remaining task states in parallel
+        3. Saving new review results to cache

-        return metric_score
+        Args:
+            subset: Name of the subset being reviewed.
+            task_states: List of task states containing model predictions.

-    def dump_report(self, reviews_score_all: List[dict], use_table: bool = True):
+        Returns:
+            List[SampleScore]: Evaluation scores for each sample.
+        """
+        # Initialize sample score list and filter cached reviews if caching is enabled
+        if self.use_cache and not self.task_config.rerun_review:
+            cached_score_list, task_states = self.cache_manager.filter_review_cache(subset, task_states)
+        else:
+            # Init a clean sample score list
+            cached_score_list = []
+            self.cache_manager.delete_review_cache(subset)
+
+        if not task_states:
+            return cached_score_list
+
+        logger.info(f'Reviewing {len(task_states)} samples; this may take a while for large datasets.')
+
+        def worker(task_state: TaskState) -> SampleScore:
+            return self._review_task_state(task_state)
+
+        def on_result(task_state: TaskState, sample_score: SampleScore) -> None:
+            review_result = self.cache_manager.save_review_cache(
+                subset=subset,
+                task_state=task_state,
+                sample_score=sample_score,
+                save_metadata=self.benchmark.save_metadata
+            )
+            logger.debug(f'Review result: \n{review_result.pretty_print()}')
+
+        def on_error(task_state: TaskState, exc: Exception) -> None:
+            tb_str = traceback.format_exc()
+            logger.error(f'Error when reviewing sample {task_state.sample_id}: {exc}\nTraceback:\n{tb_str}')
+            if self.task_config.ignore_errors:
+                logger.warning('Error ignored, continuing with next sample.')
+                return
+            raise exc
+
+        # Run reviews in parallel
+        reviewed_scores = run_in_threads_with_progress(
+            task_states,
+            worker,
+            desc=f'Reviewing[{self.benchmark_name}@{subset}]: ',
+            max_workers=self.task_config.judge_worker_num,
+            heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+            on_error=on_error,
+            # Do not persist interim results when batch scoring is enabled
+            on_result=None if self.benchmark.use_batch_scoring else on_result,
+            filter_none_results=False,
+        )
+
+        # Batch calculate metrics if supported by the benchmark
+        if self.benchmark.use_batch_scoring:
+            reviewed_scores = self._batch_review_task_states(
+                task_states=task_states, reviewed_scores=reviewed_scores, on_result=on_result
+            )
+
+        logger.info(f'Finished reviewing subset: {subset}. Total reviewed: {len(reviewed_scores)}')
+        return cached_score_list + reviewed_scores
+
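Note that prediction and review use different knobs: eval_batch_size bounds the inference workers in get_answers, while judge_worker_num bounds the review workers (and the window size for batch scoring). Caching is governed by use_cache and rerun_review, and per-sample failures by ignore_errors. A hedged configuration sketch follows; the field names mirror the task_config attributes referenced above, but the exact TaskConfig constructor and value types are assumptions for illustration.

# Illustrative only: these field names mirror the task_config attributes referenced
# in the diff; the real TaskConfig may expose them differently.
from evalscope import TaskConfig  # assumed import path

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # hypothetical model id
    datasets=['gsm8k'],                # hypothetical benchmark selection
    eval_batch_size=8,                 # parallel inference workers (get_answers)
    judge_worker_num=4,                # parallel review workers and batch-scoring window (get_reviews)
    ignore_errors=False,               # True: log per-sample failures and continue
    rerun_review=False,                # True: discard cached reviews and re-score predictions
    analysis_report=False,             # True: generate an LLM-written analysis of the report
    use_cache=None,                    # assumed: point at a previous output dir to reuse predictions/reviews
)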
+    def _review_task_state(self, task_state: TaskState) -> SampleScore:
         """
-        Get report for total reviews of specific dataset.
-        It is required to rewrite this method to support your own evaluator.
+        Helper method to review a single task state.

         Args:
-            reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
-            use_table: whether to generate table for reports. Default to True.
+            task_state: The task state to review.

-        Returns: None
+        Returns:
+            SampleScore: The evaluation score for the task state.
         """
-        # Get report map
-        report_map: Report = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all,
-            report_name=self.custom_task_name,
-            model_name=self.model_name,
-            dataset_name=self.dataset_name)
-
-        # Dump report
-        report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
-                                        self.dataset_name + '.json')
-        os.makedirs(os.path.dirname(report_path), exist_ok=True)
-
-        # Write report
-        with open(report_path, 'w') as f:
-            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
-        logger.info(f'Dump report: {report_path} \n')
-
-        # Make table
-        if use_table:
-            try:
-                report_table: str = gen_table([self.outputs_structure.reports_dir])
-                logger.info(f'Report table: \n{report_table} \n')
-            except Exception:
-                logger.error('Failed to generate report table.')
-        return report_map
-
-    def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict:
+        # Compute evaluation metrics using the benchmark's metric calculation
+        sample_score = self.benchmark.calculate_metrics(task_state=task_state)
+        return sample_score
+
+    def _batch_review_task_states(
+        self, task_states: List[TaskState], reviewed_scores: List[SampleScore],
+        on_result: Callable[[TaskState, SampleScore], None]
+    ) -> List[SampleScore]:
+        valid_indices = [i for i, score in enumerate(reviewed_scores) if score is not None]
+        if not valid_indices:
+            return reviewed_scores
+
+        task_states = [task_states[i] for i in valid_indices]
+        reviewed_scores = [reviewed_scores[i] for i in valid_indices]
+
+        # Iterate in batches with progress bar
+        all_reviewed_scores = []
+        total = len(task_states)
+        batch_size = self.task_config.judge_worker_num
+        with tqdm(total=total, desc='Scoring (batch)', unit='sample') as pbar:
+            for start in range(0, total, batch_size):
+                # Process batch
+                end = min(start + batch_size, total)
+                batch_task_states = task_states[start:end]
+                batch_scores = reviewed_scores[start:end]
+                # Batch calculate metrics
+                updated_reviewed_scores = self.benchmark.batch_calculate_metrics(
+                    task_states=batch_task_states, sample_scores=batch_scores
+                )
+                # Append results
+                all_reviewed_scores.extend(updated_reviewed_scores)
+                # Save each result to cache
+                for task_state, sample_score in zip(batch_task_states, updated_reviewed_scores):
+                    on_result(task_state, sample_score)
+
+                pbar.update(len(batch_task_states))
+        return all_reviewed_scores
+
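The batch-scoring path above slices the surviving task states into windows of judge_worker_num and hands each window to benchmark.batch_calculate_metrics. A tiny standalone example of the same partitioning arithmetic (illustrative only, not evalscope code):

# Partition 10 items into windows of 4, mirroring range(0, total, batch_size) above.
items = list(range(10))
batch_size = 4
batches = [items[start:min(start + batch_size, len(items))] for start in range(0, len(items), batch_size)]
print(batches)  # -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]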
+    def get_report(self, agg_score_dict: Dict[str, List[AggScore]]) -> Report:
         """
-        Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
-        It is required to rewrite this method to support your own evaluator.
+        Generate a comprehensive evaluation report from aggregated scores.

-        The evaluation process is as follows:
-        1. Get the input samples from the dataset (benchmarks on the ModelScope or HuggingFace).
-        2. Get the input prompts from dataset with specific data adapter.
-        3. Get answers with model inference.
-        4. Get reviews with metric function (or reviewers).
-        5. Generate report from review results.
+        This method handles:
+        1. Creating the evaluation report from scores
+        2. Generating and displaying a summary table
+        3. Optionally generating detailed analysis
+        4. Saving the report to file

         Args:
-            infer_cfg: The config for model inference.
-            debug: Whether to run in debug mode. Default: False.
+            agg_score_dict: Dictionary mapping subset names to their aggregated scores.

         Returns:
-            Dict of results. Depends on the stage of evaluation.
-
-            stage == 'all': return the report_map
-            stage == 'infer': return the answers_map
-            stage == 'review': return the reviews_map
+            Report: The complete evaluation report.
         """
+        assert agg_score_dict, 'No scores to generate report from.'
+
+        # Get paths for saving the report
+        report_path = self.cache_manager.get_report_path()
+        report_file = self.cache_manager.get_report_file()
+
+        # Generate the main evaluation report using benchmark-specific logic
+        report = self.benchmark.generate_report(
+            scores=agg_score_dict, model_name=self.model_name, output_dir=report_path
+        )
+
+        # Generate and display a summary table of results
+        try:
+            report_table = gen_table(report_list=[report], add_overall_metric=self.benchmark.add_overall_metric)
+            logger.info(f'\n{self.benchmark_name} report table:'
+                        f'\n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+        # Generate detailed analysis if requested in configuration
+        if self.task_config.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report.generate_analysis(self.task_config.judge_model_args)
+            logger.info(f'Report analysis:\n{analysis}')
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')

-        logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')
-
-        reviews_score_all = {}  # {subset_name: (score, num)}
-        stage_answers_dict = {}
-        stage_reviews_dict = {}
-
-        prompts = self.load_dataset()
-        for subset_name, prompts_list in prompts.items():
-            limit = kwargs.get('limit', len(prompts_list))
-            prompts_list = prompts_list[:limit]
-
-            answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug, **kwargs)
-            if self.stage == EvalStage.INFER:
-                stage_answers_dict[subset_name] = answers_list
-                continue
-
-            reviews_list: list = self.get_reviews(
-                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
-
-            metric_res = self.compute_metrics(reviews_list=reviews_list)
-            reviews_score_all[subset_name] = metric_res
-            stage_reviews_dict[subset_name] = reviews_list
-
-        if self.stage == EvalStage.INFER:
-            return stage_answers_dict
-
-        if self.stage == EvalStage.REVIEW:
-            return stage_reviews_dict
-
-        # Generate report
-        report_map = self.dump_report(reviews_score_all)
-
-        logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')
+        # Save the complete report to file
+        report.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+        return report

-        return report_map
+    def finalize(self, *args, **kwargs):
+        self.benchmark.finalize(*args, **kwargs)
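get_report persists the final report via report.to_json(report_file). A minimal sketch of inspecting such a dump after a run, assuming the file is plain JSON; the path below is a hypothetical placeholder, since the real location comes from cache_manager.get_report_file():

import json

# Hypothetical path for illustration; the real path is returned by cache_manager.get_report_file().
report_file = 'outputs/<timestamp>/reports/<model_name>/<benchmark_name>.json'

with open(report_file, encoding='utf-8') as f:
    report_dict = json.load(f)

# Inspect the dump without assuming its exact schema.
print(sorted(report_dict.keys()))
for key, value in report_dict.items():
    if isinstance(value, (int, float)):
        print(f'{key}: {value}')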