evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/api/benchmark/adapters/default_data_adapter.py
@@ -0,0 +1,754 @@
1
+ import os
2
+ from collections import defaultdict
3
+ from functools import partial
4
+ from overrides import override
5
+ from tqdm.auto import tqdm
6
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type
7
+
8
+ from evalscope.api.dataset import DataLoader, Dataset, DatasetDict, LocalDataLoader, RemoteDataLoader, Sample
9
+ from evalscope.api.evaluator import TaskState
10
+ from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
11
+ from evalscope.api.metric import AggScore, SampleScore, Score
12
+ from evalscope.api.model import Model, ModelOutput
13
+ from evalscope.api.registry import get_aggregation, get_metric
14
+ from evalscope.constants import HubType, JudgeStrategy
15
+ from evalscope.report import Report, ReportGenerator
16
+ from evalscope.utils import get_logger
17
+ from ..benchmark import DataAdapter
18
+
19
+ logger = get_logger()
20
+
21
+
22
+ class DefaultDataAdapter(DataAdapter):
23
+ """
24
+ Default Data Adapter for the benchmark evaluation system.
25
+
26
+ This class serves as the base implementation for data adapters that handle:
27
+ - Dataset loading and preprocessing
28
+ - Model inference execution
29
+ - Metric calculation and aggregation
30
+ - Report generation
31
+
32
+ The adapter follows a pipeline architecture with hooks that can be overridden
33
+ in subclasses to customize behavior for specific benchmarks or evaluation tasks.
34
+
35
+ Key responsibilities:
36
+ 1. Load datasets with optional few-shot examples
37
+ 2. Process samples and format prompts
38
+ 3. Execute model inference with proper state management
39
+ 4. Calculate evaluation metrics and aggregate results
40
+ 5. Generate comprehensive evaluation reports
41
+
42
+ This class can be extended to implement specific data loading and processing
43
+ logic for different benchmark datasets and evaluation scenarios.
44
+ """
45
+
46
+ # ####################
47
+ # DATA LOADING METHODS
48
+ # ####################
49
+
50
+ @override
51
+ def load_dataset(self) -> DatasetDict:
52
+ """
53
+ Load the complete dataset including test data and optional few-shot examples.
54
+
55
+ This method handles both local and remote dataset loading, processes samples
56
+ with appropriate prompt formatting, and prepares few-shot examples if needed.
57
+
58
+ Returns:
59
+ DatasetDict: A dictionary containing the loaded and processed datasets,
60
+ organized by subset names.
61
+ """
62
+ # Load the dataset
63
+ self.test_dataset, self.fewshot_dataset = self.load()
64
+
65
+ # Process each sample's input by applying prompt templates and few-shot formatting
66
+ self._post_process_samples()
67
+
68
+ return self.test_dataset
69
+
70
+ def load(self) -> Tuple[DatasetDict, Optional[DatasetDict]]:
71
+ """Load the dataset from disk or remote source.
72
+
73
+ Returns:
74
+ Tuple[DatasetDict, Optional[DatasetDict]]: The test dataset and few-shot dataset.
75
+ """
76
+ if os.path.exists(self.dataset_id):
77
+ # Load dataset from local file system path
78
+ with self._temporary_attribute('dataset_hub', HubType.LOCAL):
79
+ return self.load_from_disk()
80
+ else:
81
+ # Load dataset from remote source (e.g., ModelScope, Huggingface)
82
+ return self.load_from_remote()
83
+
84
+ def load_from_remote(self):
85
+ """Load dataset from remote source and prepare few-shot examples if needed."""
86
+ test_dataset = None
87
+ fewshot_dataset = None
88
+ # Load dataset from remote source
89
+ test_load_func = partial(self.load_subset, data_loader=RemoteDataLoader)
90
+ test_dataset = self.load_subsets(test_load_func)
91
+
92
+ # Load few-shot examples if few-shot prompting is enabled
93
+ if self._should_load_fewshot():
94
+ fewshot_load_func = partial(self.load_fewshot_subset, data_loader=RemoteDataLoader)
95
+ fewshot_dataset = self.load_subsets(fewshot_load_func, is_fewshot=True)
96
+ return test_dataset, fewshot_dataset
97
+
98
+ def load_from_disk(self, use_local_loader: bool = False):
99
+ """
100
+ Load dataset from local disk path.
101
+
102
+ Args:
103
+ use_local_loader: If True, use local file loading; otherwise use remote loading
104
+ for local ModelScope datasets.
105
+ """
106
+ test_dataset = None
107
+ fewshot_dataset = None
108
+ if use_local_loader:
109
+ # Use LocalDataLoader for actual local file loading
110
+ test_load_func = partial(self.load_subset, data_loader=LocalDataLoader)
111
+ test_dataset = self.load_subsets(test_load_func)
112
+
113
+ # Load few-shot examples if few-shot prompting is enabled
114
+ if self._should_load_fewshot():
115
+ fewshot_load_func = partial(self.load_fewshot_subset, data_loader=LocalDataLoader)
116
+ fewshot_dataset = self.load_subsets(fewshot_load_func, is_fewshot=True)
117
+ return test_dataset, fewshot_dataset
118
+ else:
119
+ # Fallback to remote loading for local ModelScope datasets
120
+ return self.load_from_remote()
121
+
122
+ def _should_load_fewshot(self) -> bool:
123
+ """Check if few-shot dataset should be loaded."""
124
+ return self.few_shot_num > 0 and self.train_split is not None
125
+
126
+ def _post_process_samples(self):
127
+ """Process all sample inputs with prompt formatting."""
128
+ for subset in self.test_dataset.keys():
129
+ for sample in self.test_dataset[subset]:
130
+ if isinstance(sample.input, str):
131
+ sample.input = self.process_sample_str_input(sample, subset)
132
+ elif isinstance(sample.input, list):
133
+ # Handle list[ChatMessage] and add system prompt if needed
134
+ sample.input = self.process_sample_messages_input(sample, subset)
135
+
136
+ def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
137
+ """
138
+ Convert a sample's input string to a list of ChatMessage objects.
139
+
140
+ This method formats the sample input into a structured message format
141
+ suitable for model inference, including system prompts if configured.
142
+ """
143
+ input_text = self.process_sample_input(sample, subset=subset)
144
+ input_messages = [ChatMessageUser(content=input_text)]
145
+ if self.system_prompt:
146
+ input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
147
+ return input_messages
148
+
149
+ def process_sample_messages_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
150
+ """
151
+ Normalize a sample's existing List[ChatMessage] input and ensure system prompt is set once.
152
+ """
153
+ messages = list(sample.input) # shallow copy to avoid in-place mutations
154
+ if self.system_prompt and not any(isinstance(m, ChatMessageSystem) for m in messages):
155
+ messages = [ChatMessageSystem(content=self.system_prompt)] + messages
156
+ return messages
157
+
158
+ def process_sample_input(self, sample: Sample, subset: str) -> str:
159
+ """
160
+ Process a single sample's input by applying prompt templates and few-shot formatting.
161
+
162
+ This method handles the complete input preparation pipeline:
163
+ 1. Retrieves few-shot examples if enabled
164
+ 2. Formats few-shot examples into demonstration text
165
+ 3. Applies appropriate prompt template (with or without few-shot context)
166
+
167
+ Args:
168
+ sample (Sample): The sample to process
169
+ subset (str): The subset name this sample belongs to
170
+
171
+ Returns:
172
+ str: The formatted input text ready for model inference
173
+ """
174
+ if self.few_shot_num > 0:
175
+ if self.fewshot_dataset is not None:
176
+ # Retrieve few-shot examples for the current subset
177
+ few_shot_samples = self.fewshot_dataset.get(subset)
178
+ if few_shot_samples is None:
179
+ # Fallback: use the first available subset if current subset not found
180
+ first_key = next(iter(self.fewshot_dataset))
181
+ few_shot_samples = self.fewshot_dataset[first_key]
182
+ # Select fewshot samples
183
+ assert len(few_shot_samples) >= self.few_shot_num, (
184
+ f"""The dataset only have ({len(few_shot_samples)}) few-shot samples, but requested ({self.few_shot_num}) fewshot samples, please reduce 'few_shot_num'.""" # noqa: E501
185
+ )
186
+ # Convert few-shot samples to demonstration string
187
+ few_shot = '\n\n'.join([self.sample_to_fewshot(sample) for sample in few_shot_samples])
188
+ else:
189
+ # Build few-shot examples inside the format method
190
+ few_shot = ''
191
+ # Format the input text with few-shot examples and main prompt
192
+ input_text = self.format_fewshot_template(fewshot=few_shot, sample=sample)
193
+ else:
194
+ # No few-shot examples: use the prompt template directly
195
+ input_text = self.format_prompt_template(sample=sample)
196
+ return input_text
197
+
198
+ def load_subsets(self, load_func: Callable[[str], Dataset], is_fewshot=False) -> DatasetDict:
199
+ """
200
+ Load multiple subsets of the dataset using the provided loading function.
201
+
202
+ This method handles two loading strategies:
203
+ 1. Reformat mode: Load only the default subset and reformat it
204
+ 2. Multi-subset mode: Load all subsets specified in subset_list
205
+
206
+ Args:
207
+ load_func (Callable[[str], Dataset]): Function to load individual subsets
208
+
209
+ Returns:
210
+ DatasetDict: Dictionary containing all loaded subsets
211
+ """
212
+ if self.reformat_subset:
213
+ # Load only the default subset
214
+ subset_data = load_func(self.default_subset)
215
+ # Reformat the subset to create multiple subsets based on sample keys
216
+ # NOTE: subset_list and limit is applied here if specified
217
+ limit = self.few_shot_num if is_fewshot else self.limit
218
+ repeats = 1 if is_fewshot else self.repeats
219
+ dataset_dict = DatasetDict.from_dataset(
220
+ dataset=subset_data, subset_list=self.subset_list, limit=limit, repeats=repeats
221
+ )
222
+ else:
223
+ # Load all specified subsets into separate entries
224
+ subset_dict = defaultdict()
225
+ for subset in self.subset_list:
226
+ # Set current subset, since same benchmark need to differentiate
227
+ with self._temporary_attribute('current_subset_name', subset):
228
+ subset_data = load_func(subset)
229
+ subset_dict[subset] = subset_data
230
+ dataset_dict = DatasetDict(subset_dict)
231
+ return dataset_dict
232
+
233
+ def load_subset(self, subset: str, data_loader: Type[DataLoader]) -> Dataset:
234
+ """
235
+ Load a specific subset of the dataset for evaluation.
236
+
237
+ Args:
238
+ subset (str): The subset identifier to load
239
+ data_loader (Type[DataLoader]): The data loader class to use for loading
240
+
241
+ Returns:
242
+ Dataset: The loaded dataset subset with processed samples
243
+ """
244
+ # Determine the split and subset names based on configuration
245
+ split = subset if self.split_as_subset else self.eval_split
246
+ subset_name = self.default_subset if self.split_as_subset else subset
247
+
248
+ # Create and configure the remote data loader
249
+ loader = data_loader(
250
+ data_id_or_path=self.dataset_id,
251
+ split=split,
252
+ subset=subset_name,
253
+ sample_fields=self.record_to_sample, # Custom sample conversion function
254
+ filter_func=self.sample_filter,
255
+ limit=self.limit if not self.reformat_subset else None, # Limit number of samples if specified
256
+ repeats=self.repeats, # Number of repetitions for each sample
257
+ shuffle=self.shuffle, # Shuffle dataset if enabled
258
+ shuffle_choices=self.shuffle_choices, # Shuffle choices if requested
259
+ data_source=self.dataset_hub, # Data source configuration
260
+ )
261
+ dataset = loader.load()
262
+ return dataset
263
+
264
+ def load_fewshot_subset(self, subset: str, data_loader: Type[DataLoader]) -> Dataset:
265
+ """
266
+ Load a subset specifically for few-shot examples.
267
+
268
+ Args:
269
+ subset (str): The subset identifier to load few-shot examples from
270
+ data_loader (Type[DataLoader]): The data loader class to use for loading
271
+
272
+ Returns:
273
+ Dataset: The loaded few-shot dataset with demonstration examples
274
+ """
275
+ # Use training split for few-shot examples
276
+ split = subset if self.split_as_subset else self.train_split
277
+ subset_name = self.default_subset if self.split_as_subset else subset
278
+
279
+ # Create loader specifically configured for few-shot sampling
280
+ loader = data_loader(
281
+ data_id_or_path=self.dataset_id,
282
+ split=split,
283
+ subset=subset_name,
284
+ sample_fields=self.record_to_sample,
285
+ filter_func=self.sample_filter, # Apply sample filtering if defined
286
+ limit=self.few_shot_num
287
+ if not self.reformat_subset else None, # Limit to specified number of few-shot examples
288
+ shuffle=self.few_shot_random, # Randomize selection if enabled
289
+ shuffle_choices=self.shuffle_choices, # Shuffle choices if requested
290
+ data_source=self.dataset_hub,
291
+ )
292
+ dataset = loader.load()
293
+ return dataset
294
+
295
+ def sample_filter(self, sample: Sample) -> bool:
296
+ """
297
+ Apply filtering to a dataset, only samples matching the predicate will be included.
298
+
299
+ Args:
300
+ sample (Sample): The sample to filter
301
+
302
+ Returns:
303
+ bool: True if the sample passes the filter, False otherwise
304
+ """
305
+ return True # Default implementation allows all samples
306
+
307
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
308
+ """
309
+ Convert a raw data record to a Sample object.
310
+
311
+ This method must be implemented in subclasses to handle dataset-specific
312
+ field mapping and data processing logic.
313
+
314
+ Args:
315
+ record (Dict[str, Any]): Raw data record from the dataset
316
+
317
+ Returns:
318
+ Sample: Processed sample object ready for evaluation
319
+ """
320
+ raise NotImplementedError('This method should be implemented in subclasses')
321
+
322
+ def sample_to_fewshot(self, sample: Sample) -> str:
323
+ """
324
+ Convert a Sample object to a formatted few-shot demonstration string.
325
+
326
+ This method must be implemented in subclasses to define how samples
327
+ are formatted as examples in few-shot prompts.
328
+
329
+ Args:
330
+ sample (Sample): The sample to convert to a few-shot example
331
+
332
+ Returns:
333
+ str: Formatted few-shot demonstration string
334
+ """
335
+ raise NotImplementedError('This method should be implemented in subclasses')
336
+
337
+ def format_prompt_template(self, sample: Sample) -> str:
338
+ """
339
+ Format the basic prompt template with the sample data.
340
+
341
+ This method applies the prompt template to format the input text
342
+ for models when no few-shot examples are used.
343
+
344
+ Args:
345
+ sample (Sample): The sample object containing the prompt data
346
+
347
+ Returns:
348
+ str: The formatted prompt ready for model input
349
+ """
350
+ return self.prompt_template.format(question=sample.input)
351
+
352
+ def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
353
+ """
354
+ Format the few-shot template with demonstrations and the main prompt.
355
+
356
+ This method combines few-shot examples with the main prompt using
357
+ the configured few-shot template.
358
+
359
+ Args:
360
+ fewshot (str): The formatted few-shot demonstration examples
361
+ sample (Sample): The sample object containing the prompt data
362
+
363
+ Returns:
364
+ str: The complete formatted input with few-shot context
365
+ """
366
+ return self.few_shot_prompt_template.format(fewshot=fewshot, question=sample.input)
367
+
+    # #################
+    # INFERENCE METHODS
+    # #################
+
+    def _on_inference_start(self, model: Model, sample: Sample) -> None:
+        """
+        Hook method called before inference starts.
+
+        This method can be overridden in subclasses to implement custom
+        preparation logic before model inference (e.g., model configuration,
+        sample preprocessing, state initialization).
+
+        Args:
+            model (Model): The model that will perform inference
+            sample (Sample): The sample to be processed
+        """
+        pass
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        """
+        Hook method called during the actual inference process.
+
+        This method executes the model inference and can be overridden
+        to implement custom inference logic or model interaction patterns.
+
+        Args:
+            model (Model): The model to use for inference
+            sample (Sample): The sample to process
+
+        Returns:
+            ModelOutput: The raw output from the model
+        """
+        # Execute model inference with the processed input and any tools
+        model_output = model.generate(input=sample.input, tools=sample.tools)
+        return model_output
+
+    def _on_inference_end(
+        self, model: Model, sample: Sample, model_output: ModelOutput, output_dir: str, **kwargs
+    ) -> TaskState:
+        """
+        Hook method called after inference completes.
+
+        This method processes the model output and creates a TaskState object
+        that encapsulates all information about the completed inference task.
+        You can save the model output to the specified output directory.
+
+        Args:
+            model (Model): The model that performed inference
+            sample (Sample): The processed sample
+            model_output (ModelOutput): The raw model output
+            output_dir (str): The directory where the model output was saved
+
+        Returns:
+            TaskState: Complete state object for the inference task
+        """
+        return TaskState(
+            model=model.name,
+            sample=sample,
+            messages=[model_output.message],
+            output=model_output,
+            completed=True,
+        )
+
+    @override
+    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+        """
+        Execute the complete inference pipeline for a single sample.
+
+        This method orchestrates the full inference process using the hook methods:
+        1. Pre-inference preparation
+        2. Model inference execution
+        3. Post-inference processing and state creation
+
+        Args:
+            model (Model): The model to use for inference
+            sample (Sample): The sample to process
+            output_dir (str): The directory to store the generated files
+
+        Returns:
+            TaskState: Complete state object containing inference results
+        """
+        self._on_inference_start(model, sample)
+        model_output = self._on_inference(model, sample)
+        task_state = self._on_inference_end(model, sample, model_output, output_dir, **kwargs)
+
+        return task_state
+
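A rough standalone sketch of this hook pipeline (the class and attribute names below are placeholders, not evalscope APIs): a subclass overrides only the stage it needs while run_inference keeps the start -> infer -> end ordering.

class InferencePipeline:
    def _on_inference_start(self, model, sample):
        pass  # default: no preparation

    def _on_inference(self, model, sample):
        return model(sample)  # stand-in for model.generate(...)

    def _on_inference_end(self, model, sample, output):
        return {'sample': sample, 'output': output, 'completed': True}

    def run_inference(self, model, sample):
        self._on_inference_start(model, sample)
        output = self._on_inference(model, sample)
        return self._on_inference_end(model, sample, output)


class LoggingPipeline(InferencePipeline):
    # Override only the pre-inference hook, e.g. to log or preprocess the sample.
    def _on_inference_start(self, model, sample):
        print(f'running inference on: {sample!r}')


state = LoggingPipeline().run_inference(model=str.upper, sample='hello world')
print(state)  # {'sample': 'hello world', 'output': 'HELLO WORLD', 'completed': True}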
+    # ##########################
+    # METRIC CALCULATION METHODS
+    # ##########################
+
+    def filter_prediction(self, prediction: str, task_state: TaskState) -> str:
+        """
+        Filter and prepare the model prediction for metric calculation.
+
+        This method applies configured filters and custom answer extraction
+        to clean and prepare the raw model output for evaluation.
+
+        Args:
+            prediction (str): The raw model prediction
+            task_state (TaskState): The complete task state for context
+
+        Returns:
+            str: The filtered and extracted prediction ready for evaluation
+        """
+        if self.filter_ensemble is not None:
+            # Apply configured filters to clean the prediction
+            prediction = self.filter_ensemble(prediction)
+
+        # Apply custom answer extraction logic
+        extracted_prediction = self.extract_answer(prediction, task_state)
+        return extracted_prediction
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        """
+        Hook method for custom answer extraction from model predictions.
+
+        This method can be overridden in subclasses to implement specific
+        logic for extracting the final answer from complex model outputs.
+
+        Args:
+            prediction (str): The model prediction to extract from
+            task_state (TaskState): The task state for additional context
+
+        Returns:
+            str: The extracted answer
+        """
+        return prediction
+
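A minimal standalone sketch of what an extract_answer override might look like for a multiple-choice style benchmark; the regex and the 'Answer:' convention are assumptions, not the package's default behaviour:

import re


def extract_answer(prediction: str) -> str:
    # Pull a single choice letter out of a longer completion,
    # e.g. 'Reasoning ... Answer: B' -> 'B'.
    match = re.search(r'Answer:\s*([A-D])', prediction)
    # Fall back to the unchanged prediction, mirroring the pass-through default above.
    return match.group(1) if match else prediction


print(extract_answer('Step by step ... Answer: B'))  # B
print(extract_answer('no explicit answer marker'))   # unchanged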
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """
+        Calculate evaluation scores by comparing prediction with reference.
+
+        This method computes scores using all configured metrics and creates
+        a comprehensive Score object with detailed evaluation results.
+
+        Args:
+            original_prediction (str): The original, unfiltered model prediction
+            filtered_prediction (str): The filtered and processed prediction
+            reference (str): The ground truth reference answer
+            task_state (TaskState): The complete task state for context
+
+        Returns:
+            Score: Object containing all calculated metric scores and metadata
+        """
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Calculate scores for each configured metric
+        for metric in self.metric_list:
+            try:
+                if isinstance(metric, str):
+                    metric_name = metric
+                    metric_scorer = get_metric(metric)  # Get metric implementation from registry
+                    metric_func = metric_scorer()  # Instantiate the metric scorer
+                elif isinstance(metric, dict):
+                    metric_name = list(metric.keys())[0]
+                    metric_cls = get_metric(metric_name)
+                    metric_func = metric_cls(**metric[metric_name])  # Initialize with parameters
+                metric_score = metric_func(
+                    prediction=filtered_prediction,
+                    reference=reference,
+                )
+                score.value[metric_name] = metric_score
+            except Exception as e:
+                logger.error(f'Error calculating metric {metric}: {e}')
+                score.value[metric_name] = 0
+                score.metadata[metric_name] = f'error: {str(e)}'
+
+        return score
+
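To see how a metric_list mixing plain names and parameterized entries is resolved, here is a standalone sketch mirroring the str/dict branches above against a toy registry; the metric names and the registry itself are invented for illustration:

def exact_match(prediction, reference):
    return float(prediction.strip() == reference.strip())


class PrefixMatch:
    def __init__(self, length=3):
        self.length = length

    def __call__(self, prediction, reference):
        return float(prediction[:self.length] == reference[:self.length])


# Toy registry: plain names map to zero-argument factories, parameterized
# entries map to classes initialized with their keyword arguments.
registry = {'exact_match': lambda: exact_match, 'prefix_match': PrefixMatch}

metric_list = ['exact_match', {'prefix_match': {'length': 2}}]
score_value = {}
for metric in metric_list:
    if isinstance(metric, str):
        metric_name, metric_func = metric, registry[metric]()
    else:
        metric_name = list(metric.keys())[0]
        metric_func = registry[metric_name](**metric[metric_name])
    score_value[metric_name] = metric_func(prediction='42', reference='42')

print(score_value)  # {'exact_match': 1.0, 'prefix_match': 1.0}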
+    @override
+    def calculate_metrics(self, task_state: TaskState) -> SampleScore:
+        """
+        Calculate comprehensive evaluation metrics for a completed task.
+
+        This method processes the task state to extract predictions, applies
+        filtering and answer extraction, calculates all configured metrics,
+        and packages the results into a SampleScore object.
+
+        Args:
+            task_state (TaskState): The completed task state to evaluate
+
+        Returns:
+            SampleScore: Complete scoring results for the sample
+
+        Raises:
+            AssertionError: If the task state is not marked as completed
+        """
+        assert task_state.completed, \
+            'TaskState must be completed before calculating metrics.'
+
+        # Extract the raw prediction from the model output
+        prediction = task_state.output.completion
+
+        # Apply filtering and answer extraction
+        filtered_prediction = self.filter_prediction(prediction, task_state)
+
+        if self.judge_strategy == JudgeStrategy.LLM_RECALL:
+            # Step 1: Calculate standard metric scores (rule-based)
+            rule_based_score = self.match_score(
+                original_prediction=prediction,
+                filtered_prediction=filtered_prediction,
+                reference=task_state.target,
+                task_state=task_state
+            )
+
+            # Step 2: Apply LLM judge if enabled and get final score
+            final_score = self.maybe_llm_match_score(
+                original_prediction=prediction,
+                filtered_prediction=filtered_prediction,
+                reference=task_state.target,
+                task_state=task_state,
+                rule_based_score=rule_based_score
+            )
+        else:
+            if self.use_llm_judge:
+                # Use LLM judge to compute the match score directly
+                final_score = self.llm_match_score(
+                    original_prediction=prediction,
+                    filtered_prediction=filtered_prediction,
+                    reference=task_state.target,
+                    task_state=task_state
+                )
+            else:
+                # Use standard match score calculation without LLM judge
+                final_score = self.match_score(
+                    original_prediction=prediction,
+                    filtered_prediction=filtered_prediction,
+                    reference=task_state.target,
+                    task_state=task_state
+                )
+
+        # Package the results into a sample score object
+        sample_score = SampleScore(
+            score=final_score,
+            sample_id=task_state.sample_id,
+            group_id=task_state.group_id,
+            sample_metadata=task_state.metadata,
+        )
+
+        return sample_score
+
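The branching above can be summarized with a small standalone sketch: LLM_RECALL first computes a rule-based score and then lets a judge refine it, otherwise either the judge or the rule-based matcher scores directly. The strategy string and the dummy scorers below are illustrative only:

def rule_score(prediction, reference):
    return {'match': float(prediction == reference)}


def judge_score(prediction, reference, prior=None):
    # Stand-in for an LLM judge; here it simply keeps any rule-based prior.
    return prior if prior is not None else {'match': 0.5}


def score_sample(strategy, use_llm_judge, prediction, reference):
    if strategy == 'llm_recall':
        rule_based = rule_score(prediction, reference)
        return judge_score(prediction, reference, prior=rule_based)
    if use_llm_judge:
        return judge_score(prediction, reference)
    return rule_score(prediction, reference)


print(score_sample('llm_recall', True, 'Paris', 'Paris'))  # rule-based first, then judged
print(score_sample('auto', False, 'Paris', 'Lyon'))        # plain rule-based score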
+    def batch_match_score(
+        self, original_predictions: List[str], filtered_predictions: List[str], references: List[str],
+        task_states: List[TaskState]
+    ) -> Optional[List[Score]]:
+        """
+        Batch calculate evaluation scores by comparing predictions with references.
+
+        This method computes scores using all configured metrics for a batch of samples
+        and creates a list of Score objects with detailed evaluation results.
+
+        Args:
+            original_predictions (List[str]): The original, unfiltered model predictions
+            filtered_predictions (List[str]): The filtered and processed predictions
+            references (List[str]): The ground truth reference answers
+            task_states (List[TaskState]): The complete task states for context
+
+        Returns:
+            List[Score]: List of objects containing all calculated metric scores and metadata
+        """
+        return None  # Default implementation does not support batch scoring
+
+    @override
+    def batch_calculate_metrics(self, task_states: List[TaskState],
+                                sample_scores: List[SampleScore]) -> List[SampleScore]:
+        """Batch calculate metrics for a list of task states with tqdm progress and batch processing."""
+        total = len(task_states)
+        if total == 0:
+            return sample_scores
+
+        # Prepare lists for batch processing
+        original_predictions: List[str] = []
+        filtered_predictions: List[str] = []
+        references: List[str] = []
+
+        for ts in task_states:
+            pred = ts.output.completion
+            original_predictions.append(pred)
+            filtered_predictions.append(self.filter_prediction(pred, ts))
+            references.append(ts.target)
+
+        batch_scores = self.batch_match_score(
+            original_predictions=original_predictions,
+            filtered_predictions=filtered_predictions,
+            references=references,
+            task_states=task_states
+        )
+
+        if batch_scores is not None:
+            assert len(batch_scores) == len(sample_scores), \
+                'Batch scores length must match sample scores length.'
+            for batch_score, sample_score in zip(batch_scores, sample_scores):
+                sample_score.score.value.update(batch_score.value)
+
+        return sample_scores
+
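A standalone sketch of a batch scorer in the spirit of batch_match_score: score the whole batch in one pass and return per-sample dictionaries in order, so the caller can merge them back into the matching sample scores. The metric name and input shapes here are assumptions:

from typing import Dict, List


def batch_match_score(filtered_predictions: List[str], references: List[str]) -> List[Dict[str, float]]:
    # One pass over the batch; order is preserved so result i belongs to sample i.
    return [
        {'exact_match': float(pred.strip() == ref.strip())}
        for pred, ref in zip(filtered_predictions, references)
    ]


print(batch_match_score([' 4', 'seven'], ['4', '7']))
# [{'exact_match': 1.0}, {'exact_match': 0.0}]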
+    @override
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate individual sample scores into summary statistics.
+
+        This method uses the configured aggregation method to compute
+        summary statistics (e.g., mean, median, percentiles) across
+        all sample scores for comprehensive evaluation results.
+
+        Args:
+            sample_scores (List[SampleScore]): Individual scores for all samples
+
+        Returns:
+            List[AggScore]: Aggregated scores and statistics
+        """
+        # Get the configured aggregation implementation
+        aggregate_cls = get_aggregation(self.aggregation)
+        aggregator = aggregate_cls()
+
+        # Compute aggregated scores
+        agg_scores = aggregator(sample_scores)
+
+        return agg_scores
+
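And a standalone sketch of a mean aggregator in the spirit of aggregate_scores: collapse per-sample metric values into one summary value per metric. The input shape (a list of {'metric': value} dicts) is an assumption for illustration:

from collections import defaultdict


def aggregate_mean(sample_scores):
    sums, counts = defaultdict(float), defaultdict(int)
    for score in sample_scores:
        for name, value in score.items():
            sums[name] += value
            counts[name] += 1
    return {name: sums[name] / counts[name] for name in sums}


print(aggregate_mean([{'acc': 1.0}, {'acc': 0.0}, {'acc': 1.0}]))  # {'acc': 0.666...}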
+    # #########################
+    # REPORT GENERATION METHODS
+    # #########################
+
+    def _on_generate_report_end(self, report: Report, output_dir: str, **kwargs) -> None:
+        """
+        Hook method called after generating the evaluation report.
+
+        This method can be overridden in subclasses to implement custom
+        post-processing of the generated report (e.g., additional formatting,
+        custom visualizations, external integrations).
+
+        Args:
+            report (Report): The generated evaluation report
+            output_dir (str): Directory where the report should be saved
+        """
+        pass
+
+    def _on_generate_report(self, scores: Dict[str, List[AggScore]], model_name: str) -> Report:
+        """
+        Hook method called during report generation.
+
+        This method creates the evaluation report using the configured
+        report generator and can be overridden to implement custom
+        report generation logic.
+
+        Args:
+            scores (Dict[str, List[AggScore]]): Aggregated scores organized by subset
+            model_name (str): Name of the evaluated model
+
+        Returns:
+            Report: The generated evaluation report
+        """
+        return ReportGenerator.generate_report(
+            score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=self.add_aggregation_name
+        )
+
+    @override
+    def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str, output_dir: str, **kwargs) -> Report:
+        """
+        Generate a comprehensive evaluation report from aggregated scores.
+
+        This method orchestrates the complete report generation process:
+        1. Creates the report using configured generators
+        2. Applies any post-processing through hook methods
+
+        Args:
+            scores (Dict[str, List[AggScore]]): Aggregated scores by subset name
+            model_name (str): Name of the model being evaluated
+
+        Returns:
+            Report: Complete evaluation report with results and analysis
+        """
+        report = self._on_generate_report(scores, model_name=model_name)
+        self._on_generate_report_end(report, output_dir, **kwargs)
+        return report
+
+    def finalize(self, *args, **kwargs):
+        # Finalize the evaluation process
+        self.sandbox_finalize(*args, **kwargs)