evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/backend/rag_eval/utils/embedding.py
@@ -1,14 +1,18 @@
 import os
 import torch
 from langchain_core.embeddings import Embeddings
+from langchain_openai.embeddings import OpenAIEmbeddings
+from mteb.encoder_interface import PromptType
 from sentence_transformers import models
 from sentence_transformers.cross_encoder import CrossEncoder
 from sentence_transformers.SentenceTransformer import SentenceTransformer
 from torch import Tensor
+from tqdm import tqdm
 from typing import Dict, List, Optional, Union
 
 from evalscope.backend.rag_eval.utils.tools import download_model
 from evalscope.constants import HubType
+from evalscope.utils.argument_utils import get_supported_params
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -18,16 +22,16 @@ class BaseModel(Embeddings):
 
     def __init__(
         self,
-        model_name_or_path: str,
+        model_name_or_path: str = '',
         max_seq_length: int = 512,
-        prompt: str = '',
-        revision: Optional[str] = None,
+        prompt: Optional[str] = None,
+        prompts: Optional[Dict[str, str]] = None,
+        revision: Optional[str] = 'master',
         **kwargs,
     ):
         self.model_name_or_path = model_name_or_path
         self.max_seq_length = max_seq_length
         self.model_kwargs = kwargs.pop('model_kwargs', {})
-        self.model_kwargs['trust_remote_code'] = True
 
         self.config_kwargs = kwargs.pop('config_kwargs', {})
         self.config_kwargs['trust_remote_code'] = True
@@ -36,7 +40,9 @@ class BaseModel(Embeddings):
         self.encode_kwargs['convert_to_tensor'] = True
 
         self.prompt = prompt
+        self.prompts = prompts if prompts else {}
         self.revision = revision
+        self.framework = ['PyTorch']
 
     @property
     def mteb_model_meta(self):
@@ -44,10 +50,22 @@ class BaseModel(Embeddings):
         from mteb import ModelMeta
 
         return ModelMeta(
-            name=os.path.basename(self.model_name_or_path),
+            name='eval/' + os.path.basename(self.model_name_or_path),  # Ensure the name contains a slash
             revision=self.revision,
             languages=None,
             release_date=None,
+            n_parameters=None,
+            memory_usage_mb=None,
+            max_tokens=None,
+            embed_dim=None,
+            license=None,
+            open_weights=None,
+            public_training_code=None,
+            public_training_data=None,
+            similarity_fn_name=None,
+            use_instructions=None,
+            training_datasets=None,
+            framework=self.framework,
         )
 
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
@@ -59,7 +77,7 @@ class BaseModel(Embeddings):
         Returns:
             List of embeddings.
         """
-        return self.encode_corpus(texts).tolist()
+        return self.encode(texts).tolist()
 
     def embed_query(self, text: str) -> List[float]:
         """Embed query text. Compact langchain.
@@ -70,19 +88,17 @@ class BaseModel(Embeddings):
         Returns:
            Embedding.
         """
-        return self.encode_queries(text).tolist()
+        return self.encode(text).tolist()
 
     def encode(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:
        """Embed text."""
        raise NotImplementedError
 
-    def encode_queries(self, queries: List[str], **kwargs) -> list[torch.Tensor]:
-        """Embed query text. Compact mteb."""
-        raise NotImplementedError
-
-    def encode_corpus(self, corpus: Union[List[str], List[Dict[str, str]]], **kwargs) -> list[torch.Tensor]:
-        """Embed search docs . Compact mteb."""
-        raise NotImplementedError
+    def get_prompt(self, task_name: str) -> Optional[str]:
+        """Get prompt for the given task name."""
+        if self.prompt:
+            return self.prompt
+        return self.prompts.get(task_name, None)
 
 
 class SentenceTransformerModel(BaseModel):
@@ -90,6 +106,9 @@ class SentenceTransformerModel(BaseModel):
     def __init__(self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
 
+        self.framework = ['Sentence Transformers', 'PyTorch']
+
+        self.model_kwargs['trust_remote_code'] = True
         if not pooling_mode:
             self.model = SentenceTransformer(
                 self.model_name_or_path,
@@ -110,43 +129,59 @@ class SentenceTransformerModel(BaseModel):
 
         self.model.max_seq_length = self.max_seq_length
 
-    def encode(self, texts: Union[str, List[str]], prompt=None, **kwargs) -> List[torch.Tensor]:
-        kwargs.pop('prompt_name', '')  # remove prompt name, use prompt
+        self.supported_encode_params = get_supported_params(self.model.encode)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> List[torch.Tensor]:
+        # pop unused kwargs
+        extra_params = {}
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                extra_params[key] = kwargs.pop(key)
         self.encode_kwargs.update(kwargs)
 
+        # set prompt if provided
+        prompt = None
+        prompt_type = extra_params.pop('prompt_type', '')
+        task_name = extra_params.pop('task_name', '')
+        if prompt_type and prompt_type == PromptType.query:
+            prompt = self.get_prompt(task_name)
+
         embeddings = self.model.encode(texts, prompt=prompt, **self.encode_kwargs)
         assert isinstance(embeddings, Tensor)
         return embeddings.cpu().detach()
 
-    def encode_queries(self, queries, **kwargs):
-        return self.encode(queries, prompt=self.prompt)
-
-    def encode_corpus(self, corpus, **kwargs):
-        if isinstance(corpus[0], dict):
-            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
-        else:
-            input_texts = corpus
-        return self.encode(input_texts)
-
 
 class CrossEncoderModel(BaseModel):
 
     def __init__(self, model_name_or_path: str, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
+
+        self.framework = ['Sentence Transformers', 'PyTorch']
+
         self.model = CrossEncoder(
             self.model_name_or_path,
             trust_remote_code=True,
             max_length=self.max_seq_length,
+            automodel_args=self.model_kwargs,
         )
-
-    def predict(self, sentences: List[List[str]], **kwargs) -> List[List[float]]:
+        self.tokenizer = self.model.tokenizer
+        # set pad token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        if ('pad_token_id' not in self.model.config) or (self.model.config.pad_token_id is None):
+            self.model.config.update({'pad_token_id': self.tokenizer.eos_token_id})
+
+        self.supported_encode_params = get_supported_params(self.model.predict)
+
+    def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                kwargs.pop(key)
         self.encode_kwargs.update(kwargs)
 
-        if len(sentences[0]) == 3:  # Note: For mteb retrieval task
+        if len(sentences[0]) == 2:  # Note: For mteb retrieval task
             processed_sentences = []
-            for query, docs, instruction in sentences:
-                if isinstance(docs, dict):
-                    docs = docs['text']
+            for query, docs in sentences:
                 processed_sentences.append((self.prompt + query, docs))
             sentences = processed_sentences
         embeddings = self.model.predict(sentences, **self.encode_kwargs)
@@ -154,6 +189,60 @@ class CrossEncoderModel(BaseModel):
         return embeddings
 
 
+class APIEmbeddingModel(BaseModel):
+
+    def __init__(self, **kwargs):
+        self.model_name = kwargs.get('model_name')
+        self.openai_api_base = kwargs.get('api_base')
+        self.openai_api_key = kwargs.get('api_key')
+        self.dimensions = kwargs.get('dimensions')
+        self.check_embedding_ctx_length = kwargs.get('check_embedding_ctx_length', False)
+        self.framework = ['API']
+
+        self.model = OpenAIEmbeddings(
+            model=self.model_name,
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+            dimensions=self.dimensions,
+            check_embedding_ctx_length=self.check_embedding_ctx_length,
+        )
+
+        super().__init__(model_name_or_path=self.model_name, **kwargs)
+
+        self.batch_size = self.encode_kwargs.get('batch_size', 10)
+
+        self.supported_encode_params = get_supported_params(self.model.embed_documents)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+        # pop unused kwargs
+        extra_params = {}
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                extra_params[key] = kwargs.pop(key)
+        self.encode_kwargs.update(kwargs)
+
+        # set prompt if provided
+        prompt = None
+        prompt_type = extra_params.pop('prompt_type', '')
+        task_name = extra_params.pop('task_name', '')
+        if prompt_type and prompt_type == PromptType.query:
+            prompt = self.get_prompt(task_name)
+
+        if isinstance(texts, str):
+            texts = [texts]
+
+        embeddings: List[List[float]] = []
+        for i in tqdm(range(0, len(texts), self.batch_size)):
+            # set prompt if provided
+            if prompt is not None:
+                batch_texts = [prompt + text for text in texts[i:i + self.batch_size]]
+            else:
+                batch_texts = texts[i:i + self.batch_size]
+            response = self.model.embed_documents(batch_texts, chunk_size=self.batch_size)
+            embeddings.extend(response)
+        return torch.tensor(embeddings)
+
+
 class EmbeddingModel:
     """Custom embeddings"""
 
@@ -165,6 +254,10 @@ class EmbeddingModel:
         revision: Optional[str] = 'master',
         **kwargs,
    ):
+        if kwargs.get('model_name'):
+            # If model_name is provided, use OpenAIEmbeddings
+            return APIEmbeddingModel(**kwargs)
+
         # If model path does not exist and hub is 'modelscope', download the model
         if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
             model_name_or_path = download_model(model_name_or_path, revision)
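
Read together, these embedding changes drop the mteb-specific encode_queries/encode_corpus hooks in favor of a single encode() with per-task prompts, and add an OpenAI-compatible APIEmbeddingModel behind the EmbeddingModel factory. A minimal usage sketch; model ids, the endpoint, and keyword defaults are assumptions, not taken from this diff:

from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel

# Local path: without `model_name`, the factory builds a SentenceTransformer-backed
# model (downloading from ModelScope when the path does not exist locally).
local_emb = EmbeddingModel(model_name_or_path='AI-ModelScope/bge-large-zh')  # placeholder model id

# API path: passing `model_name` routes construction to APIEmbeddingModel, which
# wraps langchain's OpenAIEmbeddings and batches embed_documents calls.
api_emb = EmbeddingModel(
    model_name='text-embedding-v3',        # placeholder model name
    api_base='http://localhost:8000/v1',   # placeholder endpoint
    api_key='EMPTY',
)

vectors = api_emb.embed_documents(['hello world', 'evalscope rag eval'])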

evalscope/backend/rag_eval/utils/llm.py
@@ -2,11 +2,10 @@ import os
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
 from langchain_openai import ChatOpenAI
-from modelscope.utils.hf_util import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
-from evalscope.constants import DEFAULT_MODEL_REVISION
-from evalscope.models import ChatGenerationModelAdapter
+from evalscope.api.model import GenerateConfig, Model, get_model
+from evalscope.constants import DEFAULT_MODEL_REVISION, EvalType
 
 
 class LLM:
@@ -16,9 +15,9 @@ class LLM:
         api_base = kw.get('api_base', None)
         if api_base:
             return ChatOpenAI(
-                model_name=kw.get('model_name', ''),
-                openai_api_base=api_base,
-                openai_api_key=kw.get('api_key', 'EMPTY'),
+                model=kw.get('model_name', ''),
+                base_url=api_base,
+                api_key=kw.get('api_key', 'EMPTY'),
             )
         else:
             return LocalLLM(**kw)
@@ -30,17 +29,19 @@ class LocalLLM(BaseLLM):
     model_name_or_path: str
     model_revision: str = DEFAULT_MODEL_REVISION
     template_type: Optional[str] = None
-    model_name: Optional[str]
-    model: Optional[ChatGenerationModelAdapter]
-    generation_config: Optional[Dict]
+    model_name: Optional[str] = None
+    model: Optional[Model] = None
+    generation_config: Optional[Dict] = {}
 
     def __init__(self, **kw):
         super().__init__(**kw)
         self.model_name = os.path.basename(self.model_name_or_path)
-        self.model = ChatGenerationModelAdapter(
-            model_id=self.model_name_or_path,
-            model_revision=self.model_revision,
-            generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
+
+        # Create and initialize the local model
+        self.model = get_model(
+            model=self.model_name_or_path,
+            eval_type=EvalType.CHECKPOINT,
+            config=GenerateConfig(**self.generation_config),
         )
 
     def _call(
@@ -51,10 +52,9 @@ class LocalLLM(BaseLLM):
         **kwargs: Any,
     ) -> str:
         """Run the LLM on the given input."""
-        infer_cfg = {'stop': stop}
 
-        response = self.model._model_generate(prompt, infer_cfg)
-        return response
+        response = self.model.generate(input=prompt)
+        return response.completion
 
     @property
     def _identifying_params(self) -> Dict[str, Any]:
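
A sketch of the two construction paths in the updated RAG-eval LLM helper; the values are placeholders and the LLM.load(**kw) entry point is assumed from earlier releases:

from evalscope.backend.rag_eval.utils.llm import LLM

# With api_base set, the loader returns a langchain ChatOpenAI client using the
# renamed keyword arguments (model / base_url / api_key).
remote_llm = LLM.load(
    model_name='qwen2.5-7b-instruct',     # placeholder model name
    api_base='http://localhost:8801/v1',  # placeholder endpoint
    api_key='EMPTY',
)

# Without api_base, the loader builds a LocalLLM whose _call() now goes through
# evalscope.api.model.get_model / generate() instead of ChatGenerationModelAdapter.
local_llm = LLM.load(
    model_name_or_path='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder checkpoint
    generation_config={'max_tokens': 256},            # assumed GenerateConfig field
)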

evalscope/backend/vlm_eval_kit/backend_manager.py
@@ -1,10 +1,12 @@
 import copy
+import os
 import subprocess
 from functools import partial
 from typing import Optional, Union
 
 from evalscope.backend.base import BackendManager
-from evalscope.utils import get_valid_list, is_module_installed
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -66,8 +68,11 @@ class VLMEvalKitBackendManager(BackendManager):
                 del remain_cfg['name']  # remove not used args
                 del remain_cfg['type']  # remove not used args
 
-                self.valid_models.update({model_type: partial(model_class, model=model_type, **remain_cfg)})
-                new_model_names.append(model_type)
+                norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+                model_cfg['type'] = norm_model_type
+
+                self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
+                new_model_names.append(norm_model_type)
             else:
                 remain_cfg = copy.deepcopy(model_cfg)
                 del remain_cfg['name']  # remove not used args
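
The VLMEvalKit change normalizes API-style model identifiers before they are used as registry keys, while the original identifier is still passed to the model class. A small illustration of the normalization expression from the diff above (example identifiers are hypothetical):

import os

def norm(model_type: str) -> str:
    # Same expression as in the diff: strip any path prefix, then make the
    # name registry-safe by replacing ':' and '.'.
    return os.path.basename(model_type).replace(':', '-').replace('.', '_')

print(norm('openai/gpt-4o-mini'))   # -> 'gpt-4o-mini'
print(norm('qwen-vl-max:latest'))   # -> 'qwen-vl-max-latest'
print(norm('llava-v1.5-7b'))        # -> 'llava-v1_5-7b'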

evalscope/benchmarks/__init__.py
@@ -2,16 +2,17 @@
 import glob
 import importlib
 import os
+import time
 
-from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
-from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.utils import get_logger
 
 logger = get_logger()
 
 # Using glob to find all files matching the pattern
-pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
-files = glob.glob(pattern, recursive=False)
+pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
+files = glob.glob(pattern, recursive=True)
+
+import_times = []
 
 for file_path in files:
     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
@@ -19,5 +20,16 @@ for file_path in files:
         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
         full_path = f'evalscope.benchmarks.{module_path}'
+
+        start_time = time.perf_counter()
         importlib.import_module(full_path)
-        # print(f'Importing {full_path}')
+        end_time = time.perf_counter()
+
+        import_times.append((full_path, end_time - start_time))
+
+# Sort by import time in descending order
+import_times.sort(key=lambda x: x[1], reverse=True)
+
+# Log the sorted import times
+for module, duration in import_times:
+    logger.debug(f'Module {module} imported in {duration:.6f} seconds')
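
The loader change matters because adapters can now live more than one directory below evalscope/benchmarks (for example bfcl/v3/bfcl_v3_adapter.py). A small sketch of the old versus new glob behavior, using only the standard library:

import glob
import os

base = 'evalscope/benchmarks'                                # illustrative path
old_pattern = os.path.join(base, '*', '*_adapter.py')        # top-level adapters only
new_pattern = os.path.join(base, '*', '**', '*_adapter.py')  # nested adapters too

old_files = set(glob.glob(old_pattern))
new_files = set(glob.glob(new_pattern, recursive=True))

# With recursive=True, '**' also matches zero directories, so every old match is
# still found; the difference is the newly discovered nested adapter modules.
print(sorted(new_files - old_files))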

evalscope/benchmarks/aa_lcr/__init__.py (file without changes)
@@ -0,0 +1,205 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # flake8: noqa: E501
+ import re
+ import urllib.request
+ import zipfile
+ from pathlib import Path
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ # Default judge prompt template
+ JUDGE_PROMPT = """Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.
+
+ The question, for reference only: {question}
+ The OFFICIAL ANSWER: {correct_answer}
+ CANDIDATE ANSWER TO ASSESS: {response}
+
+ Reply only with CORRECT or INCORRECT."""
+
+ PROMPT_TEMPLATE = """
+ BEGIN INPUT DOCUMENTS
+
+ {documents_text}
+
+ END INPUT DOCUMENTS
+
+ Answer the following question using the input documents provided above.
+
+ START QUESTION
+
+ {question}
+
+ END QUESTION
+ """
+
+ # New constants for auto-download
+ DOWNLOAD_URL: str = (
+     'https://modelscope.cn/datasets/evalscope/AA-LCR/resolve/master/extracted_text/AA-LCR_extracted-text.zip'
+ )
+ DEFAULT_CACHE_SUBDIR: str = 'aa_lcr'
+ DEFAULT_ZIP_NAME: str = 'AA-LCR_extracted-text.zip'
+ DEFAULT_EXTRACTED_DIR_NAME: str = 'lcr'
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='aa_lcr',
+         pretty_name='AA-LCR',
+         tags=[Tags.KNOWLEDGE, Tags.REASONING, Tags.LONG_CONTEXT],
+         description='AA-LCR (Artificial Analysis Long Context Retrieval) is a benchmark for evaluating long-context '
+         'retrieval and reasoning capabilities of language models across multiple documents.', # noqa: E501
+         dataset_id='evalscope/AA-LCR',
+         metric_list=['acc'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=PROMPT_TEMPLATE,
+         extra_params={'text_dir': None}
+     )
+ )
+ class AALCRAdapter(DefaultDataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self._use_llm_judge = True
+
+         # Get extra parameters
+         self.text_dir = self.extra_params.get('text_dir')
+
+     def load(self):
+         # Auto download and extract when text_dir is not provided
+         if not self.text_dir:
+             self.text_dir = self._ensure_text_dir_downloaded()
+         elif not Path(self.text_dir).exists():
+             raise ValueError(
+                 'AA-LCR text_dir does not exist: '
+                 f'{self.text_dir}. Please provide a valid directory or omit text_dir to auto-download.'
+             )
+
+         self.text_dir = Path(self.text_dir)
+         return super().load()
+
+     def _ensure_text_dir_downloaded(self) -> Path:
+         """Ensure AA-LCR extracted texts are available locally; download and extract if missing."""
+         cache_root = Path(DEFAULT_EVALSCOPE_CACHE_DIR) / DEFAULT_CACHE_SUBDIR
+         extracted_dir = cache_root / DEFAULT_EXTRACTED_DIR_NAME
+
+         if extracted_dir.exists():
+             logger.info(f'AA-LCR documents found: {extracted_dir}')
+             return extracted_dir
+
+         cache_root.mkdir(parents=True, exist_ok=True)
+         zip_path = cache_root / DEFAULT_ZIP_NAME
+
+         try:
+             logger.info(f'Downloading AA-LCR documents from {DOWNLOAD_URL} to {zip_path}...')
+             urllib.request.urlretrieve(DOWNLOAD_URL, zip_path)
+
+             logger.info(f'Extracting {zip_path} to {cache_root}...')
+             with zipfile.ZipFile(zip_path, 'r') as zf:
+                 zf.extractall(cache_root)
+
+             if not extracted_dir.exists():
+                 raise ValueError(f'Extraction succeeded but target directory not found: {extracted_dir}')
+
+             logger.info(f'AA-LCR documents ready at {extracted_dir}')
+             return extracted_dir
+         except Exception as e:
+             raise ValueError(
+                 f'Failed to download or extract AA-LCR documents: {e}. '
+                 'You can also manually download and set extra_params["text_dir"].'
+             ) from e
+         finally:
+             # Best-effort cleanup of the zip file
+             try:
+                 if zip_path.exists():
+                     zip_path.unlink()
+             except Exception:
+                 pass
+
+     def _get_context(self, record: Dict[str, Any]) -> str:
+         doc_folder = self.text_dir / record['document_category'] / record['document_set_id']
+
+         # Check if the document folder exists
+         if not doc_folder.exists() or not doc_folder.is_dir():
+             logger.warning(f'Document folder not found: {doc_folder}. Returning empty context.')
+             return ''
+
+         doc_blocks = []
+         try:
+             for file_path in doc_folder.iterdir():
+                 if file_path.is_file():
+                     try:
+                         content = file_path.read_text(encoding='utf-8').strip()
+                         if content:
+                             doc_blocks.append(content)
+                     except (IOError, UnicodeDecodeError) as e:
+                         logger.warning(f'Could not read file {file_path}, skipping: {e}')
+         except OSError as e:
+             logger.warning(f'Could not access document folder {doc_folder}: {e}')
+             return f"ERROR: Could not read documents for {record['document_category']}/{record['document_set_id']}"
+
+         documents_text = '\n\n'.join(
+             f'BEGIN DOCUMENT {i + 1}:\n{doc}\nEND DOCUMENT {i + 1}' for i, doc in enumerate(doc_blocks)
+         )
+         return documents_text
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Convert a record to a Sample with long-context prompt."""
+         context = self._get_context(record)
+         prompt = self.prompt_template.format(documents_text=context, question=record['question'])
+
+         return Sample(
+             input=[ChatMessageUser(content=prompt)],
+             target=record['answer'],
+             metadata={
+                 'question': record['question'],
+                 'data_source_urls': record['data_source_urls'],
+                 'input_tokens': record.get('input_tokens', 0),
+             }
+         )
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         judge_prompt = JUDGE_PROMPT.format(
+             question=task_state.metadata['question'], correct_answer=reference, response=filtered_prediction
+         )
+
+         # Request judge and obtain score
+         judge_response = self.llm_judge.judge(prompt=judge_prompt)
+
+         # Parse judge response to get accuracy score
+         # Use word boundaries to avoid matching "CORRECT" within "INCORRECT"
+         is_correct = bool(re.search(r'\bCORRECT\b', judge_response, re.IGNORECASE))
+         score.value = {
+             'acc': 1.0 if is_correct else 0.0,
+         }
+         score.explanation = f'LLM judge: {judge_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id,
+         }
+         score.main_score_name = 'acc'
+         return score
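For orientation, a rough sketch of how the newly registered `aa_lcr` benchmark might be invoked. This assumes the `TaskConfig`/`run_task` entry points and the per-dataset `dataset_args` → `extra_params` plumbing behave as for other evalscope benchmarks; the model identifier and local path below are placeholders, not part of this diff:

```python
from evalscope import TaskConfig, run_task

# Sketch only: model identifier and text_dir path are placeholders.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',
    datasets=['aa_lcr'],
    dataset_args={
        'aa_lcr': {
            # Optional: point at an already-extracted copy of the documents;
            # omitting it should trigger the auto-download in _ensure_text_dir_downloaded().
            'extra_params': {'text_dir': '/path/to/lcr'},
        }
    },
    limit=5,  # small smoke-test run
)

run_task(task_cfg)
```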
@@ -0,0 +1,54 @@
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+ logger = get_logger()
+
+ SUBSET_LIST = ['default']
+
+ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='ai2d',
+         pretty_name='AI2D',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'AI2D is a benchmark dataset for researching the understanding of diagrams by AI. It contains over 5,000 diverse diagrams from science textbooks (e.g., the water cycle, food webs). Each diagram is accompanied by multiple-choice questions that test an AI\'s ability to interpret visual elements, text labels, and their relationships. The benchmark is challenging because it requires jointly understanding the layout, symbols, and text to answer questions correctly.', # noqa: E501
+         dataset_id='lmms-lab/ai2d',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template=MULT_CHOICE_PROMPT,
+     )
+ )
+ class Ai2dAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         answers_list: list[str] = record['options']
+         input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
+         content_list: list[Content] = [ContentText(text=input_text)]
+         image = record.get('image')
+         if image:
+             image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+             content_list.append(ContentImage(image=image_base64))
+
+         label_answer = chr(int(record['answer']) + ord('A'))
+
+         return Sample(input=[ChatMessageUser(content=content_list)], choices=answers_list, target=label_answer)
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         answers = parse_answers(task_state)
+         return ''.join(sorted(list(answers)))
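The `bytes_to_base64(..., format='png', add_header=True)` helper used above is assumed here to turn raw image bytes into a base64 data URL; the helper's exact behavior is not shown in this diff, so the standard-library sketch below is only illustrative:

```python
import base64

def image_bytes_to_data_url(data: bytes, mime: str = 'image/png') -> str:
    """Illustrative stand-in for bytes_to_base64(..., add_header=True): raw bytes -> data URL."""
    encoded = base64.b64encode(data).decode('ascii')
    return f'data:{mime};base64,{encoded}'

# Example: wrap PNG bytes much like the adapter does before building ContentImage.
# data_url = image_bytes_to_data_url(record['image']['bytes'])
```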