evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,454 @@
1
+ # flake8: noqa: E501
2
+ import glob
3
+ import os
4
+ from collections import defaultdict
5
+ from typing import Any, Dict, List
6
+
7
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
8
+ from evalscope.api.dataset import DatasetDict, DictDataLoader, Sample
9
+ from evalscope.api.evaluator import TaskState
10
+ from evalscope.api.messages.chat_message import ChatMessageUser
11
+ from evalscope.api.metric import AggScore, SampleScore, Score
12
+ from evalscope.api.registry import register_benchmark
13
+ from evalscope.constants import Tags
14
+ from evalscope.report import Report, ReportKey
15
+ from evalscope.utils.logger import get_logger
16
+
17
+ logger = get_logger()
18
+
19
+ GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"." # noqa: E501
20
+
21
+ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>".strip(
22
+ ) # noqa: E501
23
+
24
+
25
+ @register_benchmark(
26
+ BenchmarkMeta(
27
+ name='general_arena',
28
+ pretty_name='GeneralArena',
29
+ tags=[Tags.CUSTOM, Tags.ARENA],
30
+ description=
31
+ 'GeneralArena is a custom benchmark designed to evaluate the performance of large language models in a competitive setting, '
32
+ 'where models are pitted against each other in custom tasks to determine their relative strengths and weaknesses. You should '
33
+ 'provide the model outputs in the format of a list of dictionaries, where each dictionary contains the model name and its report path. '
34
+ 'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html).',
35
+ dataset_id='general_arena',
36
+ metric_list=['winrate'],
37
+ aggregation='elo',
38
+ few_shot_num=0,
39
+ train_split=None,
40
+ eval_split='test',
41
+ system_prompt=GRADER_SYSTEM_PROMPT,
42
+ prompt_template=GRADER_TEMPLATE,
43
+ extra_params={
44
+ 'models': [{
45
+ 'name': 'qwen-plus',
46
+ 'report_path': 'outputs/20250627_172550/reports/qwen-plus'
47
+ }, {
48
+ 'name': 'qwen2.5-7b',
49
+ 'report_path': 'outputs/20250627_172817/reports/qwen2.5-7b-instruct'
50
+ }],
51
+ 'baseline':
52
+ 'qwen2.5-7b'
53
+ }
54
+ )
55
+ )
56
+ class GeneralArenaAdapter(DefaultDataAdapter):
57
+
58
+ def __init__(self, *args, **kwargs):
59
+ super().__init__(*args, **kwargs)
60
+
61
+ self._use_llm_judge = True
62
+
63
+ self.models = self.extra_params.get('models', [])
64
+ self.baseline = self.extra_params.get('baseline', None)
65
+
66
+ def load(self):
67
+ """Load dataset by processing model reports."""
68
+ self._check_names()
69
+ self._check_reports()
70
+ self._check_datasets()
71
+ logger.info(f'Overall datasets: {self.overall_datasets}')
72
+ dataset_model_dict = self._load_common_datasets()
73
+ datasets = self._build_pair_wise_data(dataset_model_dict)
74
+
75
+ # Convert to DatasetDict format
76
+ dataset_dict = {}
77
+ for subset_name, samples in datasets.items():
78
+ dataset = DictDataLoader(
79
+ dict_list=samples,
80
+ limit=self.limit,
81
+ shuffle=self.shuffle,
82
+ repeats=self.repeats,
83
+ sample_fields=self.record_to_sample
84
+ ).load()
85
+ dataset_dict[subset_name] = dataset
86
+
87
+ test_dataset = DatasetDict(dataset_dict)
88
+ return test_dataset, None
89
+
90
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
91
+ """Convert a data record to a Sample object."""
92
+ return Sample(
93
+ input=[ChatMessageUser(content=record['question'])],
94
+ target=record['answer_2'], # baseline answer
95
+ metadata={
96
+ 'answer_1': record['answer_1'],
97
+ 'model_1': record['model_1'],
98
+ 'model_2': record['model_2'],
99
+ }
100
+ )
101
+
102
+ def _check_names(self):
103
+ """Check the names of the models and baseline."""
104
+ # check duplicate models
105
+ model_names = [model['name'] for model in self.models]
106
+ if len(model_names) != len(set(model_names)):
107
+ raise ValueError(f'Duplicate model names found in the models list {model_names}.')
108
+ # check if models list is empty
109
+ if len(self.models) < 2:
110
+ raise ValueError('Models list must contain at least two models.')
111
+ # check baseline model
112
+ if self.baseline and self.baseline not in model_names:
113
+ raise ValueError(f'Baseline model {self.baseline} not found in the models list.')
114
+ # check if the baseline model is not set
115
+ if not self.baseline:
116
+ logger.warning('Baseline model is not set. Using the first model as the baseline.')
117
+ self.baseline = self.models[0]['name']
118
+
119
+ def _check_reports(self):
120
+ """Check if the report paths are valid."""
121
+ for model in self.models:
122
+ report_path = model.get('report_path', None)
123
+ if not report_path or not os.path.exists(report_path):
124
+ raise ValueError(f'Report path {report_path} for model {model["name"]} does not exist.')
125
+ reports = []
126
+ for report_item in glob.glob(os.path.join(report_path, '*.json')):
127
+ report = Report.from_json(report_item)
128
+ reports.append(report)
129
+ model['reports'] = reports
130
+
131
+ def _check_datasets(self):
132
+ """Check common datasets in the reports."""
133
+ overall_datasets = set()
134
+ for model in self.models:
135
+ datasets = set()
136
+ for report in model['reports']:
137
+ report_df = report.to_dataframe()
138
+ # get unique (dataset, subset) tuples
139
+ unique_datasets = set(zip(report_df[ReportKey.dataset_name], report_df[ReportKey.subset_name]))
140
+ datasets.update(unique_datasets)
141
+ model['datasets'] = datasets
142
+ # get overall datasets by intersecting all models' datasets
143
+ overall_datasets = set.intersection(*[model['datasets'] for model in self.models if 'datasets' in model])
144
+ self.overall_datasets = overall_datasets
145
+
146
+ def _load_common_datasets(self):
147
+ """Load common datasets from the local path."""
148
+ from evalscope.utils import OutputsStructure
149
+ from evalscope.utils.io_utils import jsonl_to_list
150
+
151
+ dataset_dict = defaultdict(dict)
152
+ for dataset_name, subset_name in self.overall_datasets:
153
+ for model in self.models:
154
+ dataset_path = model['report_path'].replace(OutputsStructure.REPORTS_DIR, OutputsStructure.REVIEWS_DIR)
155
+ dataset_file_path = os.path.join(dataset_path, f'{dataset_name}_{subset_name}.jsonl')
156
+ if not os.path.exists(dataset_file_path):
157
+ raise ValueError(
158
+ f'Dataset {dataset_name} with subset {subset_name} not found in model {model["name"]}.'
159
+ )
160
+ dataset = jsonl_to_list(dataset_file_path)
161
+ # sort by index
162
+ dataset.sort(key=lambda x: x.get('index'))
163
+ dataset_dict[(dataset_name, subset_name)][model['name']] = dataset
164
+
165
+ return dataset_dict
166
+
167
+ def _build_pair_wise_data(self, dataset_dict):
168
+ """Build pairwise data for the models."""
169
+ from evalscope.api.evaluator import ReviewResult
170
+ from .utils import process_review_item
171
+
172
+ pairwise_data = defaultdict(list)
173
+ for (dataset_name, subset_name), model_data in dataset_dict.items():
174
+ if len(model_data) < 2:
175
+ logger.warning(f'Not enough models for dataset {dataset_name} with subset {subset_name}. Skipping.')
176
+ continue
177
+ # create pairwise data for each model against the baseline
178
+ model_names = list(model_data.keys())
179
+ for name in model_names:
180
+ if name == self.baseline:
181
+ continue
182
+ pairs = []
183
+ for model_item, baseline_item in zip(model_data[name], model_data[self.baseline]):
184
+ # Convert to ReviewResult objects like in get_model_prediction
185
+ model_review = ReviewResult.model_validate(model_item)
186
+ baseline_review = ReviewResult.model_validate(baseline_item)
187
+
188
+ for model_choice, baseline_choice in zip(
189
+ process_review_item(model_review), process_review_item(baseline_review)
190
+ ):
191
+ pairs.append({
192
+ 'question': model_choice['Question'],
193
+ 'answer_1': model_choice['Generated'],
194
+ 'answer_2': baseline_choice['Generated'],
195
+ 'model_1': name,
196
+ 'model_2': self.baseline
197
+ })
198
+ pairwise_data[f'{dataset_name}&{subset_name}@{name}&{self.baseline}'] = pairs
199
+
200
+ return pairwise_data
201
+
202
+ def llm_match_score(
203
+ self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
204
+ ) -> Score:
205
+ """Use LLM as a judge to evaluate the predicted answer against the baseline."""
206
+ from .utils import get_judge_score, post_process_result
207
+
208
+ score = Score(
209
+ extracted_prediction=filtered_prediction,
210
+ prediction=original_prediction,
211
+ )
212
+
213
+ question = task_state.input_text
214
+ answer_1 = task_state.metadata['answer_1']
215
+ answer_2 = reference # baseline answer
216
+ model_1 = task_state.metadata['model_1']
217
+ model_2 = task_state.metadata['model_2']
218
+
219
+ system_template = self.system_prompt
220
+ prompt_template = self.prompt_template
221
+
222
+ prompt1 = prompt_template.format(question=question, answer_1=answer_1, answer_2=answer_2)
223
+ # reverse the order
224
+ prompt2 = prompt_template.format(question=question, answer_1=answer_2, answer_2=answer_1)
225
+
226
+ # get grading response
227
+ game1_response = self.llm_judge.judge(prompt1, system_prompt=system_template)
228
+ game2_response = self.llm_judge.judge(prompt2, system_prompt=system_template)
229
+
230
+ # parse grading response
231
+ # game1
232
+ res1 = post_process_result(game1_response)
233
+ score1 = get_judge_score(res1, reverse=False)
234
+ # game2
235
+ res2 = post_process_result(game2_response)
236
+ score2 = get_judge_score(res2, reverse=True)
237
+
238
+ battle_result = {
239
+ 'score': (score1 + score2) / 2,
240
+ 'games': [
241
+ {
242
+ 'model_a': model_1,
243
+ 'model_b': model_2,
244
+ 'response': game1_response,
245
+ 'judgment': res1
246
+ },
247
+ {
248
+ 'model_a': model_2,
249
+ 'model_b': model_1,
250
+ 'response': game2_response,
251
+ 'judgment': res2
252
+ },
253
+ ]
254
+ }
255
+
256
+ score.value = {'score': battle_result['score']}
257
+ score.explanation = f'LLM judge battles: Game1: {game1_response[:100]}... Game2: {game2_response[:100]}...'
258
+ score.metadata = {
259
+ 'source': 'llm_judge',
260
+ 'judge_strategy': getattr(self, 'judge_strategy', 'default'),
261
+ 'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown',
262
+ 'battle_result': battle_result
263
+ }
264
+ score.main_score_name = 'score'
265
+
266
+ return score
267
+
268
+ def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
269
+ """Aggregate scores to compute winrate."""
270
+ import numpy as np
271
+ import pandas as pd
272
+
273
+ from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
274
+
275
+ battles = pd.concat([get_battles_from_row(res.score.metadata['battle_result']) for res in sample_scores])
276
+
277
+ bt_model_coef = compute_mle_elo(battles, baseline_model=self.baseline)
278
+
279
+ bootstrap_model_coef = get_bootstrap_result(
280
+ battles, func_compute_elo=compute_mle_elo, num_round=100, baseline_model=self.baseline
281
+ )
282
+
283
+ stats = pd.DataFrame()
284
+ stats['results'] = None
285
+ stats['results'] = stats['results'].astype('object')
286
+
287
+ for i, model in enumerate(bt_model_coef.index):
288
+ stats.at[i, 'model'] = model
289
+ stats.at[i, 'score'] = bt_model_coef[model]
290
+ stats.at[i, 'lower'] = np.percentile(bootstrap_model_coef[model], 2.5)
291
+ stats.at[i, 'upper'] = np.percentile(bootstrap_model_coef[model], 97.5)
292
+
293
+ metrics_dict = {}
294
+ metrics_dict['winrate'] = get_win_rate_column(stats, 'score', self.baseline).to_dict()
295
+ metrics_dict['winrate_lower'] = get_win_rate_column(stats, 'lower', self.baseline).to_dict()
296
+ metrics_dict['winrate_upper'] = get_win_rate_column(stats, 'upper', self.baseline).to_dict()
297
+
298
+ agg_scores = []
299
+ for metric_name, models in metrics_dict.items():
300
+ for model_name, score_val in models.items():
301
+ if model_name == self.baseline:
302
+ continue
303
+ agg_scores.append(AggScore(score=score_val, metric_name=metric_name, num=len(sample_scores)))
304
+
305
+ return agg_scores
306
+
307
+ def extract_answer(self, prediction, task_state):
308
+         # NOTE: This is a hacky way to extract the answer from the prediction
+         return task_state.metadata['answer_1']
+
+     def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
+         """Post-process the report to convert it to a DataFrame with winrate leaderboards."""
+         import pandas as pd
+         import tabulate
+
+         report_path = output_dir
+         leaderboard_file = os.path.join(report_path, 'leaderboard.txt')
+
+         # Ensure report directory exists
+         os.makedirs(report_path, exist_ok=True)
+
+         # Convert report to dataframe
+         df = report.to_dataframe()
+
+         # Filter for winrate-related metrics
+         winrate_df = df[df[ReportKey.metric_name].str.contains('winrate')].copy()
+
+         if winrate_df.empty:
+             logger.warning('No winrate data found in the report.')
+             return
+
+         # Get all model names from self.models
+         all_model_names = [model['name'] for model in self.models]
+
+         # Collect all leaderboard outputs
+         leaderboard_outputs = []
+
+         def format_leaderboard(data_df, title):
+             """Format DataFrame as leaderboard with CI."""
+             # Pivot to get winrate, winrate_lower, winrate_upper as columns
+             pivot_df = data_df.pivot_table(
+                 index=[ReportKey.model_name], columns=ReportKey.metric_name, values=ReportKey.score, aggfunc='first'
+             )
+
+             # Add baseline model with 50% winrate
+             baseline_data = {'winrate': 0.5, 'winrate_lower': 0.5, 'winrate_upper': 0.5}
+
+             # Create a complete index with all models
+             complete_index = pd.Index(all_model_names, name=pivot_df.index.name)
+             pivot_df = pivot_df.reindex(complete_index)
+
+             # Fill baseline model data
+             if self.baseline in pivot_df.index:
+                 for col, val in baseline_data.items():
+                     if col in pivot_df.columns:
+                         pivot_df.loc[self.baseline, col] = val
+
+             # Fill missing values with winrate score for other models
+             if 'winrate' in pivot_df.columns:
+                 pivot_df['winrate_lower'] = pivot_df.get('winrate_lower', pivot_df['winrate'])
+                 pivot_df['winrate_upper'] = pivot_df.get('winrate_upper', pivot_df['winrate'])
+
+             # Format for display
+             leaderboard_data = []
+             for model in pivot_df.index:
+                 if pd.isna(pivot_df.loc[model, 'winrate']):
+                     continue
+
+                 score_pct = pivot_df.loc[model, 'winrate'] * 100
+                 lower_diff = (pivot_df.loc[model, 'winrate_lower'] - pivot_df.loc[model, 'winrate']) * 100
+                 upper_diff = (pivot_df.loc[model, 'winrate_upper'] - pivot_df.loc[model, 'winrate']) * 100
+
+                 leaderboard_data.append({
+                     'Model': model,
+                     'WinRate (%)': f'{score_pct:.1f}',
+                     'CI (%)': f'({lower_diff:+.1f} / {upper_diff:+.1f})'
+                 })
+
+             # Sort by score descending
+             leaderboard_data.sort(key=lambda x: float(x['WinRate (%)'].replace('%', '')), reverse=True)
+
+             # Create DataFrame
+             leaderboard_df = pd.DataFrame(leaderboard_data)
+             leaderboard_df.index = range(len(leaderboard_df))
+
+             # Format as string
+             table_str = tabulate.tabulate(leaderboard_df, headers='keys', showindex=False)
+             output = f'{title}\n{table_str}\n'
+
+             logger.info(f'\n{title}\n{table_str}')
+             return output
+
+         # Parse dataset and subset information from dataset_name column
+         # Format: '{dataset_name}&{subset_name}@{name}&{self.baseline}'
+         def parse_dataset_key(dataset_key):
+             """Parse dataset key to extract dataset_name, subset_name, and model pair."""
+             parts = dataset_key.split('@')
+
+             dataset_subset = parts[0]
+             model_pair = parts[1]
+
+             dataset_name, subset_name = dataset_subset.split('&', 1)
+             model_1, model_2 = model_pair.split('&', 1)
+
+             return dataset_name, subset_name, model_1, model_2
+
+         # Add parsed columns
+         parsed_data = []
+         for _, row in winrate_df.iterrows():
+             dataset_name, subset_name, model_1, model_2 = parse_dataset_key(row[ReportKey.subset_name])
+             if dataset_name is not None:
+                 parsed_data.append({
+                     'dataset_name': dataset_name,
+                     'subset_name': subset_name,
+                     ReportKey.model_name: model_1,
+                     ReportKey.metric_name: row[ReportKey.metric_name],
+                     ReportKey.score: row[ReportKey.score]
+                 })
+
+         if not parsed_data:
+             logger.warning('No valid dataset keys found for parsing.')
+             return
+
+         parsed_df = pd.DataFrame(parsed_data)
+
+         # 1. Overall ranking (aggregate across all datasets and subsets)
+         overall_df = parsed_df.groupby([ReportKey.model_name,
+                                         ReportKey.metric_name])[ReportKey.score].mean().reset_index()
+         leaderboard_outputs.append(format_leaderboard(overall_df, '=== OVERALL LEADERBOARD ==='))
+
+         # 2. Dataset-level rankings
+         datasets = parsed_df['dataset_name'].unique()
+         for dataset in sorted(datasets):
+             dataset_df = parsed_df[parsed_df['dataset_name'] == dataset]
+             dataset_agg = dataset_df.groupby([ReportKey.model_name,
+                                               ReportKey.metric_name])[ReportKey.score].mean().reset_index()
+             leaderboard_outputs.append(format_leaderboard(dataset_agg, f'=== DATASET LEADERBOARD: {dataset} ==='))
+
+         # 3. Subset-level rankings
+         subsets = parsed_df[['dataset_name', 'subset_name']].drop_duplicates()
+         for _, subset_row in subsets.iterrows():
+             dataset_name = subset_row['dataset_name']
+             subset_name = subset_row['subset_name']
+             subset_df = parsed_df[(parsed_df['dataset_name'] == dataset_name)
+                                   & (parsed_df['subset_name'] == subset_name)]
+             leaderboard_outputs.append(
+                 format_leaderboard(subset_df, f'=== SUBSET LEADERBOARD: {dataset_name} - {subset_name} ===')
+             )
+
+         # Write all leaderboard outputs to file
+         with open(leaderboard_file, 'w', encoding='utf-8') as f:
+             f.write('\n'.join(leaderboard_outputs))
+
+         logger.info(f'Leaderboard results saved to: {leaderboard_file}')
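For reference, the subset keys consumed by parse_dataset_key above follow the '{dataset_name}&{subset_name}@{model}&{baseline}' pattern. An illustrative check (the helper is local to the method, and the names below are made-up placeholders):

    dataset, subset, model_1, model_2 = parse_dataset_key('arena_hard&default@my-model&gpt4-0314')
    # dataset == 'arena_hard', subset == 'default', model_1 == 'my-model', model_2 == 'gpt4-0314'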
@@ -0,0 +1,223 @@
+ import inspect
+ import math
+ import numpy as np
+ import pandas as pd
+ import re
+ from collections import defaultdict
+ from sklearn.linear_model import LogisticRegression
+ from tqdm import tqdm
+
+ from evalscope.api.evaluator import ReviewResult
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def process_review_item(review_result: ReviewResult) -> list:
+     """
+     Process a ReviewResult object to extract relevant information.
+
+     Args:
+         review_result: ReviewResult object or dict (for backward compatibility)
+
+     Returns:
+         list: List of processed review items with necessary information.
+     """
+
+     # New format using ReviewResult
+     sample_score = review_result.sample_score
+     prediction = sample_score.score.prediction
+     target = review_result.target
+     extracted_prediction = sample_score.score.extracted_prediction
+
+     raw_d = {
+         'Index': str(review_result.index),
+         'Input': review_result.input,
+         'Question': review_result.input,  # Use input as question
+         'Generated':
+         prediction if prediction != extracted_prediction else extracted_prediction or '',  # Ensure no None value
+         'Gold': target,
+         'Pred': extracted_prediction,
+         'Score': sample_score.score.model_dump(exclude_none=True),
+     }
+     return [raw_d]
+
+
+ def post_process_result(completion):
+     result = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
+     if result:
+         return result[0]
+     else:
+         return None
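post_process_result pulls the verdict out of a judge completion wrapped in double square brackets; for example (illustrative inputs):

    post_process_result('My final verdict is: [[A>>B]]')  # -> 'A>>B'
    post_process_result('No verdict given')               # -> None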
+
+
+ def get_judge_score(result, reverse=False):
+     """
+     Calculate the judge score for model A from a pairwise judgment.
+
+     Args:
+         result: Judgment result ('A=B', 'A>B', 'A>>B', 'B>A', 'B>>A')
+         reverse: Whether to reverse the score
+
+     Returns:
+         float: Score in [0, 1]; unrecognized judgments default to 0.5.
+     """
+
+     # Base score mapping - using finer-grained scores
+     if not reverse:
+         score_mapping = {
+             'A=B': 0.5,  # Tie
+             'A>B': 0.75,  # A slightly wins
+             'A>>B': 1.0,  # A significantly wins
+             'B>A': 0.25,  # B slightly wins
+             'B>>A': 0.0,  # B significantly wins
+         }
+     else:
+         score_mapping = {
+             'A=B': 0.5,  # Tie
+             'A>B': 0.25,  # A slightly wins
+             'A>>B': 0.0,  # A significantly wins
+             'B>A': 0.75,  # B slightly wins
+             'B>>A': 1.0,  # B significantly wins
+         }
+
+     base_score = score_mapping.get(result, 0.5)
+
+     return base_score
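A quick reference for the mapping above (illustrative calls; unrecognized judgments fall back to 0.5):

    get_judge_score('A>>B')                # 1.0
    get_judge_score('A>>B', reverse=True)  # 0.0
    get_judge_score('A=B')                 # 0.5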
+
+
+ def get_battles_from_row(row, first_game_only=False, multiplier=3):
+     results = []
+
+     game = row['games'][0]
+     output = {'model_a': game['model_a'], 'model_b': game['model_b']}
+
+     weight = 1
+     if game['judgment'] == 'A=B':
+         output['winner'] = 'tie'
+     elif game['judgment'] == 'A>B':
+         output['winner'] = 'model_a'
+     elif game['judgment'] == 'A>>B':
+         output['winner'] = 'model_a'
+         weight = multiplier
+     elif game['judgment'] == 'B>A':
+         output['winner'] = 'model_b'
+     elif game['judgment'] == 'B>>A':
+         output['winner'] = 'model_b'
+         weight = multiplier
+     else:
+         weight = 0
+
+     if weight:
+         results += [output] * weight
+
+     if first_game_only:
+         return pd.DataFrame(results)
+
+     # Don't change the order of model_a and model_b
+     output = {'model_a': game['model_a'], 'model_b': game['model_b']}
+
+     # game 2
+     game = row['games'][1]
+
+     weight = 1
+     if game['judgment'] == 'A=B':
+         output['winner'] = 'tie'
+     elif game['judgment'] == 'A>B':
+         output['winner'] = 'model_b'
+     elif game['judgment'] == 'A>>B':
+         output['winner'] = 'model_b'
+         weight = multiplier
+     elif game['judgment'] == 'B>A':
+         output['winner'] = 'model_a'
+     elif game['judgment'] == 'B>>A':
+         output['winner'] = 'model_a'
+         weight = multiplier
+     else:
+         weight = 0
+
+     if weight:
+         results += [output] * weight
+
+     return pd.DataFrame(results)
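Each row is expected to carry two games per sample, and the second game is interpreted with the roles swapped (hence the flipped winner mapping above); decisive verdicts are repeated multiplier times so they carry more weight in the rating fit. A minimal sketch with placeholder model names:

    row = {
        'games': [
            {'model_a': 'my-model', 'model_b': 'gpt4-0314', 'judgment': 'A>>B'},
            {'model_a': 'my-model', 'model_b': 'gpt4-0314', 'judgment': 'B>A'},
        ]
    }
    battles = get_battles_from_row(row, multiplier=3)
    # 4 rows: three 'model_a' wins from game 1, one 'model_a' win from game 2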
+
+
+ def compute_mle_elo(df, scale=400, base=10, init_rating=1000, baseline_model='gpt4-0314'):
+     models = pd.concat([df['model_a'], df['model_b']]).unique()
+     models = pd.Series(np.arange(len(models)), index=models)
+
+     # duplicate battles
+     df = pd.concat([df, df], ignore_index=True)
+     p = len(models.index)
+     n = df.shape[0]
+
+     X = np.zeros([n, p])
+     X[np.arange(n), models[df['model_a']]] = +math.log(base)
+     X[np.arange(n), models[df['model_b']]] = -math.log(base)
+
+     # one A win => two A win
+     Y = np.zeros(n)
+     Y[df['winner'] == 'model_a'] = 1.0
+
+     # one tie => one A win + one B win
+     # find tie + tie (both bad) index
+     tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
+     tie_idx[len(tie_idx) // 2:] = False
+     Y[tie_idx] = 1.0
+
+     if len(np.unique(Y)) < 2:
+         logger.info('Warning: Only one class in the data')
+         elo_scores = pd.Series(init_rating, index=models.index)
+         if np.all(Y == 1.0):
+             elo_scores[df['model_a'].iloc[0]] += scale  # Boost the winning model
+         elif np.all(Y == 0.0):
+             elo_scores[df['model_b'].iloc[0]] += scale  # Boost the winning model
+         return elo_scores.sort_values(ascending=False)
+
+     lr = LogisticRegression(
+         fit_intercept=False, penalty=None, tol=1e-8
+     )  # tol may need a smaller value when the judge model is not GPT-4
+     lr.fit(X, Y)
+
+     elo_scores = scale * lr.coef_[0] + init_rating
+
+     # set anchor 1000
+     if baseline_model in models.index:
+         elo_scores += 1000 - elo_scores[models[baseline_model]]
+     return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
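compute_mle_elo fits a Bradley-Terry-style rating via logistic regression and anchors the baseline model at 1000. A minimal sketch on hand-written battles (model names are placeholders):

    battles = pd.DataFrame([
        {'model_a': 'my-model', 'model_b': 'gpt4-0314', 'winner': 'model_a'},
        {'model_a': 'my-model', 'model_b': 'gpt4-0314', 'winner': 'model_b'},
        {'model_a': 'my-model', 'model_b': 'gpt4-0314', 'winner': 'tie'},
    ])
    ratings = compute_mle_elo(battles)  # pd.Series of Elo ratings with 'gpt4-0314' pinned to 1000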
+
+
+ def get_bootstrap_result(battles, func_compute_elo, num_round, baseline_model='gpt-4-0314'):
+     rows = []
+     kwargs = {}
+     if 'baseline_model' in inspect.signature(func_compute_elo).parameters:
+         kwargs['baseline_model'] = baseline_model
+     for _ in tqdm(range(num_round), desc='bootstrap'):
+         res = func_compute_elo(battles.sample(frac=1.0, replace=True), **kwargs)
+         if res is not None:
+             rows.append(res)
+     df = pd.DataFrame(rows)
+     return df[df.median().sort_values(ascending=False).index]
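get_bootstrap_result refits the ratings on resampled battles (one row per round, one column per model), so per-model confidence intervals can be read off the column quantiles. A sketch continuing the example above:

    boot = get_bootstrap_result(battles, compute_mle_elo, num_round=100)
    lower, upper = boot.quantile(0.025), boot.quantile(0.975)  # per-model 95% interval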
+
+
+ def predict_win_rate(elo_ratings, scale=400, base=10, init_rating=1000):
+     names = sorted(list(elo_ratings.keys()))
+     wins = defaultdict(lambda: defaultdict(lambda: 0))
+     for a in names:
+         for b in names:
+             ea = 1 / (1 + base**((elo_ratings[b] - elo_ratings[a]) / scale))
+             wins[a][b] = ea
+             wins[b][a] = 1 - ea
+
+     data = {a: [wins[a][b] if a != b else np.nan for b in names] for a in names}
+
+     df = pd.DataFrame(data, index=names)
+     df.index.name = 'model_a'
+     df.columns.name = 'model_b'
+     return df.T
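predict_win_rate converts rating gaps back into expected win probabilities using the standard Elo curve 1 / (1 + base ** ((R_b - R_a) / scale)). For instance, with placeholder ratings a 400-point gap at the defaults (scale=400, base=10) corresponds to roughly a 91% expected win rate:

    table = predict_win_rate({'my-model': 1400, 'gpt4-0314': 1000})
    table.loc['my-model', 'gpt4-0314']  # ~0.909 = 1 / (1 + 10 ** (-400 / 400))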
+
+
+ def get_win_rate_column(df, column, baseline='gpt4-0314'):
+     to_dict = df[['model', column]].set_index('model').to_dict()[column]
+     win_rate_table = predict_win_rate(to_dict)
+     return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x, 4))
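get_win_rate_column expects a DataFrame with a 'model' column plus a ratings column and returns each model's expected win rate against the baseline (the baseline's own NaN diagonal entry becomes 0.5 via fillna). Note that the default baseline here is 'gpt4-0314' while get_bootstrap_result defaults to 'gpt-4-0314', so it is safest to pass the baseline name explicitly. A sketch continuing the bootstrap example:

    median_elo = boot.median()
    ratings_df = pd.DataFrame({'model': median_elo.index, 'score': median_elo.values})
    winrates = get_win_rate_column(ratings_df, 'score', baseline='gpt4-0314')  # Series indexed by model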
File without changes