evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/third_party/thinkbench/infer.py
@@ -0,0 +1,130 @@
+import os
+
+from evalscope import TaskConfig, run_task
+
+DASHSCOPE_API_KEY = 'sk-723135c241x'
+
+def eval_distill_qwen():
+    model_name = 'DeepSeek-R1-Distill-Qwen-7B'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 20000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+    )
+    run_task(task_config)
+
+
+def eval_math_qwen():
+    model_name = 'Qwen2.5-Math-7B-Instruct'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 3000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 3,
+        },
+    )
+    run_task(task_config)
+
+def eval_r1():
+    model_name = 'deepseek-r1'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=DASHSCOPE_API_KEY,
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=8,
+        generation_config={
+            'max_tokens': 20000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250307_000404',
+        timeout=36000,
+        stream=True
+    )
+    run_task(task_config)
+
+
+def eval_distill_32b():
+    model_name = 'deepseek-r1-distill-qwen-32b'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=DASHSCOPE_API_KEY,
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=5,
+        generation_config={
+            'max_tokens': 12000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250306_235951',
+        timeout=32000,
+        stream=True
+
+    )
+    run_task(task_config)
+
+def eval_qwq():
+    model_name = 'qwq-32b-preview'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=os.environ['DASHSCOPE_API_KEY'],
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 8000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250221_105911'
+    )
+    run_task(task_config)
+
+if __name__ == '__main__':
+    # eval_distill_qwen()
+    # eval_math_qwen()
+    eval_r1()
+    # eval_qwq()
+    # eval_distill_32b()
evalscope/third_party/thinkbench/resources/critique_template.txt
@@ -0,0 +1,17 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Correct Answer]
+
+{answer}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify a correct answer in a paragraph, return the index of the paragraph where the earliest correct answer occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.
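Since this template is rendered with Python's str.format, the {problem}, {answer}, and {tagged_response} placeholders are substituted directly, while the doubled braces in \boxed{{}} escape to a literal \boxed{}. A minimal rendering sketch (the loading code below is illustrative, not evalscope's own):

```python
# Illustrative rendering of the critique template above; the path is assumed.
with open('evalscope/third_party/thinkbench/resources/critique_template.txt', encoding='utf-8') as f:
    template = f.read()

prompt = template.format(
    problem='Compute 1 + 1.',
    answer='2',
    tagged_response='<paragraph_0>1 + 1 = 2, so the answer is \\boxed{2}.</paragraph_0>',
)
print(prompt)  # '{{}}' in the template survives formatting as a literal '{}'
```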
evalscope/third_party/thinkbench/resources/reformat_template.txt
@@ -0,0 +1,31 @@
+I will present you with a solution to a math problem. Unfortunately, the solution lacks proper paragraphing, making it hard to read. Your task is to improve readability by reformatting the solution into well-structured paragraphs. Follow these specific guidelines:
+
+* Insert \n\n for paragraph breaks within the original solution. Do **NOT** alter any content of the original solution (the only exception is for itemized lists; see below).
+
+- Each paragraph should represent a distinct, concise reasoning step that logically advances the solution.
+
+- Reasoning steps can include case discussions, formula simplifications, or formula derivations. Each of these should be treated as an individual reasoning step and paragraphed accordingly.
+
+- If an introductory analysis exists in the original solution, treat it as an initial reasoning step and place it as the first paragraph.
+
+- Do **NOT** place any mathematical formulas in their own separate paragraphs; instead, include them within the same paragraph as the preceding text to form a cohesive reasoning step.
+
+* For any itemized lists (ordered or unordered), convert them into a written format, such as "First/Second/Third." This is the **ONLY** content modification allowed.
+
+* Avoid making paragraphs too lengthy, as long paragraphs might contain multiple reasoning steps that should be paragraphed separately.
+
+* Disregard the accuracy of the solution content. Do **NOT** alter any of the original solution's content; focus solely on structuring it into logical, readable paragraphs.
+
+* Reply with the reformatted solution directly.
+
+--------------------------------------------------
+
+Here is the math problem, and the solution that needs to be reformatted:
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{response}
evalscope/third_party/thinkbench/tools/__init__.py
File without changes
evalscope/third_party/thinkbench/tools/llm.py
@@ -0,0 +1,48 @@
+import os
+from openai import OpenAI
+
+
+def request_url(llm_config, content):
+    try:
+        client = OpenAI(
+            api_key=llm_config['api_key'],
+            base_url=llm_config['base_url'],
+        )
+        completion = client.chat.completions.create(
+            model=llm_config['model_name'],
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+        return None
+
+def request_qwen(content):
+    try:
+        client = OpenAI(
+            api_key=os.getenv('DASHSCOPE_API_KEY'),
+            base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        )
+
+        completion = client.chat.completions.create(
+            model='qwen-max',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+
+
+def request_local(content):
+    try:
+        client = OpenAI(
+            api_key='EMPTY',
+            base_url='http://0.0.0.0:8801/v1',
+        )
+        completion = client.chat.completions.create(
+            model='Qwen2.5-72B-Instruct',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
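All three helpers share the same OpenAI-compatible call shape and differ only in credentials, base URL, and model name; errors are printed and swallowed rather than raised. A hypothetical request_url call (endpoint, key, and model are placeholders):

```python
# Hypothetical usage of request_url; endpoint, key, and model are placeholders.
llm_config = {
    'api_key': 'EMPTY',
    'base_url': 'http://0.0.0.0:8801/v1',
    'model_name': 'Qwen2.5-72B-Instruct',
}
reply = request_url(llm_config, 'Say hello in one word.')
print(reply)  # None if the request failed, since errors are only printed
```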
evalscope/third_party/thinkbench/tools/utils.py
@@ -0,0 +1,13 @@
+import re
+
+
+def extract_answer(solution_text: str):
+    boxed_pattern = r'\\boxed\{([^}]*)\}'
+    matches = re.findall(boxed_pattern, solution_text)
+    if matches:
+        last_boxed_content = matches[-1]
+        number_pattern = r'-?\d+'
+        number_matches = re.findall(number_pattern, last_boxed_content)
+        if number_matches:
+            return number_matches[-1].strip()
+    return None
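extract_answer takes the last \boxed{...} group in the text and then the last integer inside it, so surrounding prose or punctuation inside the box is tolerated; non-integer or missing boxes yield None:

```python
# Behavior sketch for extract_answer as defined above.
print(extract_answer(r'First \boxed{3}; revised: the index is \boxed{12.}'))  # '12'
print(extract_answer(r'\boxed{-1} means not found'))                          # '-1'
print(extract_answer('no boxed content here'))                                # None
```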
evalscope/third_party/toolbench_static/llm/swift_infer.py
@@ -1,37 +1,63 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
 from dataclasses import dataclass
-from swift.llm import get_default_template_type, get_model_tokenizer, get_template, inference
-from swift.utils import seed_everything
-
-# TODO: Support custom model for swift infer
 
 
 @dataclass
 class SwiftInferArgs:
     model_id_or_path: str
     model_type: str
+    infer_backend: str = 'vllm'  # one of 'pt', 'vllm', 'lmdeploy'
     max_new_tokens: int = 2048
-
+    temperature: float = 0.1
+    max_batch_size: int = 16
 
 class SwiftInfer:
 
     def __init__(self, args: SwiftInferArgs):
-        model_type = args.model_type
-        template_type = get_default_template_type(model_type)
-        model, tokenizer = get_model_tokenizer(
-            model_type, model_id_or_path=args.model_id_or_path, model_kwargs={'device_map': 'auto'})
-        model.generation_config.max_new_tokens = args.max_new_tokens
-        print(f'** Generation config: {model.generation_config}')
+        # Initialize the model for the selected inference backend
+        if args.infer_backend == 'pt':
+            self.engine: InferEngine = PtEngine(args.model_id_or_path, max_batch_size=args.max_batch_size)
+        elif args.infer_backend == 'vllm':
+            from swift.llm import VllmEngine
+            self.engine: InferEngine = VllmEngine(args.model_id_or_path, max_model_len=8192)
+        elif args.infer_backend == 'lmdeploy':
+            from swift.llm import LmdeployEngine
+            self.engine: InferEngine = LmdeployEngine(args.model_id_or_path)
+        else:
+            raise ValueError(f'Unsupported infer_backend: {args.infer_backend}')
+
+        # Basic request configuration (optional)
+        self.request_config = RequestConfig(
+            max_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            stream=False  # can be switched to True for streaming inference
+        )
 
-        template = get_template(template_type, tokenizer)
-        seed_everything(42)
+    def predict(self, system: str, query: str, history: list):
+        # In the Swift 3.0 standard interface, messages are passed in the format:
+        # messages: [{"role": "system", "content": "<SYSTEM_PROMPT>"},
+        #            {"role": "user", "content": "user question"},
+        #            {"role": "assistant", "content": "assistant answer"}, ...]
 
-        self.tokenizer = tokenizer
-        self.model = model
-        self.template = template
+        messages = []
+        if system.strip():
+            messages.append({'role': 'system', 'content': system})
 
-    def predict(self, system: str, query: str, history: list):
+        # Splice the conversation history into the messages
+        for qa_pair in history:
+            # Assumes each element of history looks like ("user input", "model response"); adjust to your data format.
+            user_answer, model_response = qa_pair
+            messages.append({'role': 'user', 'content': user_answer})
+            messages.append({'role': 'assistant', 'content': model_response})
+
+        # Add the current user question
+        messages.append({'role': 'user', 'content': query})
+
+        infer_request = InferRequest(messages=messages)
+
+        # Run inference
+        response = self.engine.infer([infer_request], self.request_config)
 
-        response, history = inference(self.model, self.template, query=query, system=system, history=history)
+        # Extract the text result returned by the model (assumes non-stream mode)
+        result_text = response[0].choices[0].message.content.strip()
 
-        return response
+        return result_text
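A hedged usage sketch of the rewritten class, assuming ms-swift 3.x is installed and that InferEngine, InferRequest, PtEngine, and RequestConfig are imported from swift.llm at the top of the full file (the hunk above does not show those import lines); the model path and type are placeholders:

```python
# Hypothetical usage; model path/type are placeholders, not evalscope defaults.
args = SwiftInferArgs(
    model_id_or_path='Qwen/Qwen2.5-7B-Instruct',
    model_type='qwen2_5',
    infer_backend='pt',  # sidesteps the vllm/lmdeploy extras
)
engine = SwiftInfer(args)
answer = engine.predict(
    system='You are a helpful assistant.',
    query='What is 2 + 2?',
    history=[('Hi', 'Hello! How can I help?')],
)
print(answer)
```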
evalscope/third_party/toolbench_static/toolbench_static.py
@@ -6,11 +6,12 @@ from typing import Union
 from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
 from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
 from evalscope.utils import get_logger
+from evalscope.utils.deprecation_utils import deprecated
 from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
 logger = get_logger()
 
-
+@deprecated(since='0.15.1', remove_in='0.18.0', alternative='Native implementation of ToolBench')
 def run_task(task_cfg: Union[str, dict]):
 
     if isinstance(task_cfg, str):
evalscope/utils/__init__.py
@@ -1,4 +1,84 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.utils.model_utils import EvalBackend
-from evalscope.utils.utils import *
+from typing import TYPE_CHECKING
+
+from .import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .argument_utils import BaseArgument, get_supported_params, parse_int_or_float
+    from .deprecation_utils import deprecated
+    from .function_utils import run_once, thread_safe
+    from .import_utils import get_module_path, is_module_installed
+    from .io_utils import (
+        OutputsStructure,
+        csv_to_jsonl,
+        csv_to_list,
+        dict_to_yaml,
+        gen_hash,
+        get_latest_folder_path,
+        get_valid_list,
+        json_to_dict,
+        jsonl_to_csv,
+        jsonl_to_list,
+        safe_filename,
+        yaml_to_dict,
+    )
+    from .logger import configure_logging, get_logger
+    from .model_utils import EvalBackend, dict_torch_dtype_to_str, fix_do_sample_warning, get_device, seed_everything
+
+else:
+    _import_structure = {
+        'argument_utils': [
+            'BaseArgument',
+            'parse_int_or_float',
+            'get_supported_params',
+        ],
+        'model_utils': [
+            'EvalBackend',
+            'get_device',
+            'seed_everything',
+            'dict_torch_dtype_to_str',
+            'fix_do_sample_warning',
+        ],
+        'import_utils': [
+            'is_module_installed',
+            'get_module_path',
+        ],
+        'function_utils': [
+            'thread_safe',
+            'run_once',
+        ],
+        'io_utils': [
+            'OutputsStructure',
+            'csv_to_list',
+            'json_to_dict',
+            'yaml_to_dict',
+            'get_latest_folder_path',
+            'gen_hash',
+            'dict_to_yaml',
+            'csv_to_jsonl',
+            'jsonl_to_csv',
+            'jsonl_to_list',
+            'gen_hash',
+            'get_valid_list',
+            'safe_filename',
+            'thread_safe',
+        ],
+        'deprecation_utils': [
+            'deprecated',
+        ],
+        'logger': [
+            'get_logger',
+            'configure_logging',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
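The _import_structure map drives a lazy loader: at import time evalscope.utils replaces itself in sys.modules with a _LazyModule, which resolves each exported name to its defining submodule only on first attribute access, while the TYPE_CHECKING branch keeps static analyzers and IDEs accurate. A minimal sketch of the same pattern (not evalscope's _LazyModule):

```python
# Minimal lazy-module sketch; illustrates the pattern, not evalscope's class.
import importlib
import types


class LazyModule(types.ModuleType):

    def __init__(self, name: str, import_structure: dict):
        super().__init__(name)
        # Map each exported name to the submodule that defines it.
        self._name_to_module = {
            attr: sub for sub, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr: str):
        # Called only when normal lookup fails, i.e. on first access.
        sub = self._name_to_module.get(attr)
        if sub is None:
            raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
        value = getattr(importlib.import_module(f'{self.__name__}.{sub}'), attr)
        setattr(self, attr, value)  # cache so later accesses skip __getattr__
        return value
```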
evalscope/utils/argument_utils.py
@@ -0,0 +1,64 @@
+import json
+from argparse import Namespace
+from inspect import signature
+
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
+
+
+class BaseArgument:
+    """
+    BaseArgument is a base class designed to facilitate the creation and manipulation
+    of argument classes in the evalscope framework. It provides utility methods for
+    instantiating objects from various data formats and converting objects back into
+    dictionary representations.
+    """
+
+    @classmethod
+    def from_dict(cls, d: dict):
+        """Instantiate the class from a dictionary."""
+        return cls(**d)
+
+    @classmethod
+    def from_json(cls, json_file: str):
+        """Instantiate the class from a JSON file."""
+        return cls.from_dict(json_to_dict(json_file))
+
+    @classmethod
+    def from_yaml(cls, yaml_file: str):
+        """Instantiate the class from a YAML file."""
+        return cls.from_dict(yaml_to_dict(yaml_file))
+
+    @classmethod
+    def from_args(cls, args: Namespace):
+        """
+        Instantiate the class from an argparse.Namespace object.
+        Filters out None values and removes 'func' if present.
+        """
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
+
+        return cls.from_dict(args_dict)
+
+    def to_dict(self):
+        """Convert the instance to a dictionary."""
+        result = self.__dict__.copy()
+        return result
+
+    def __str__(self):
+        """Return a JSON-formatted string representation of the instance."""
+        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+
+
+def parse_int_or_float(num):
+    number = float(num)
+    if number.is_integer():
+        return int(number)
+    return number
+
+
+def get_supported_params(func):
+    """Get the supported parameters of a function."""
+    sig = signature(func)
+    return set(sig.parameters.keys())
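BaseArgument is intended to be mixed into dataclasses, so cls(**d) in from_dict maps dictionary keys straight onto fields. A hedged sketch (the DemoArgs class is hypothetical, not an evalscope type):

```python
# Hypothetical dataclass built on BaseArgument as defined above.
from dataclasses import dataclass


@dataclass
class DemoArgs(BaseArgument):
    model: str
    parallel: int = 1


args = DemoArgs.from_dict({'model': 'qwen-max', 'parallel': 4})
print(args.to_dict())             # {'model': 'qwen-max', 'parallel': 4}
print(parse_int_or_float('4'))    # 4 (an int, since 4.0.is_integer())
print(parse_int_or_float('0.5'))  # 0.5
```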
evalscope/utils/chat_service.py
@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -32,6 +31,7 @@ class ModelList(BaseModel):
 class ChatMessage(BaseModel):
     role: Literal['user', 'assistant', 'system']
     content: str
+    reasoning_content: Optional[str] = None
 
 
 class DeltaMessage(BaseModel):
@@ -63,10 +63,10 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 
 class ChatCompletionResponse(BaseModel):
     model: str
-    object: Literal['chat.completion', 'chat.completion.chunk']
+    object: Literal['chat.completion', 'chat.completion.chunk', 'images.generations']
     choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
-    usage: Optional[Usage]
+    usage: Optional[Usage] = None
 
 
 class TextCompletionRequest(BaseModel):
@@ -94,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer
 
@@ -174,7 +175,7 @@ class ChatService:
        )
 
     def _prepare_text_inputs(self, request: TextCompletionRequest):
-        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=True).to(self.device)
+        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return inputs, prompt_tokens
 
@@ -203,8 +204,9 @@ class ChatService:
 
     def _prepare_chat_inputs(self, request: ChatCompletionRequest):
         formatted_prompt = self.tokenizer.apply_chat_template(
-            request.messages, tokenize=False, add_generation_prompt=True)
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
+            request.messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return formatted_prompt, inputs, prompt_tokens
 
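The usage: Optional[Usage] = None change is significant under pydantic v2, where Optional[...] no longer implies a default: without = None the field is still required (it may only be set explicitly to None). A standalone illustration of that rule, not evalscope code:

```python
# Pydantic v2 behavior: Optional[...] without a default is still required.
from typing import Optional

from pydantic import BaseModel, ValidationError


class WithoutDefault(BaseModel):
    usage: Optional[int]  # required; may be None, but must be provided


class WithDefault(BaseModel):
    usage: Optional[int] = None  # genuinely omittable


print(WithDefault())  # usage=None
try:
    WithoutDefault()
except ValidationError as e:
    print('missing field:', e.errors()[0]['type'])  # 'missing'
```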
evalscope/utils/deprecation_utils.py
@@ -0,0 +1,53 @@
+import functools
+import inspect
+import os
+from typing import Callable, Optional
+
+from .logger import get_logger
+
+logger = get_logger()
+
+
+def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optional[str] = None) -> Callable:
+    """
+    Decorator to mark functions as deprecated.
+
+    :param since: String indicating the version since deprecation
+    :param remove_in: Optional string indicating the version when it will be removed
+    :param alternative: Optional string suggesting an alternative
+    :return: Decorated function
+    """
+
+    def decorator(func: Callable) -> Callable:
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # Get the file name where the function is defined
+            file_name = os.path.basename(inspect.getfile(func))
+
+            # Construct the warning message
+            warning_parts = [
+                f'{func.__name__} in {file_name} has been deprecated since version {since}',
+                f'and will be removed in version {remove_in}' if remove_in else None,
+                f'Use {alternative} instead' if alternative else None
+            ]
+            warning_message = '. '.join(filter(None, warning_parts))
+
+            # Log the warning
+            logger.warning(warning_message)
+
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
+def deprecated_warning(logger, message: str):
+    """
+    Log a deprecation warning.
+
+    :param logger: Logger instance to log the warning
+    :param message: Warning message to log
+    """
+    logger.warning(f'Deprecated: {message}')
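Applied to a function (as it is to run_task in the toolbench_static hunk above), the decorator logs a warning on every call and then delegates unchanged. A quick sketch with a throwaway function:

```python
# Quick sketch of the decorator defined above; old_sum is a throwaway example.
@deprecated(since='1.0.0', remove_in='2.0.0', alternative='math.fsum')
def old_sum(values):
    return sum(values)


print(old_sum([1, 2, 3]))  # 6
# Each call logs one warning, with the parts joined by '. ':
# old_sum in <file>.py has been deprecated since version 1.0.0. and will be
# removed in version 2.0.0. Use math.fsum instead
```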