evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/__init__.py
@@ -1 +0,0 @@
- from evalscope.perf.main import run_perf_benchmark
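
With this re-export removed, the entry point is only importable from its defining module (evalscope/perf/main.py still exists in 1.2.0 per the file list above, so the following presumably keeps working):

    # evalscope.perf no longer re-exports the perf entry point
    from evalscope.perf.main import run_perf_benchmark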
evalscope/perf/arguments.py
@@ -3,13 +3,14 @@ import json
  import os
  import sys
  from dataclasses import dataclass, field
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Union

  from evalscope.constants import DEFAULT_WORK_DIR
+ from evalscope.utils import BaseArgument


  @dataclass
- class Arguments:
+ class Arguments(BaseArgument):
      # Model and API
      model: str  # Model name or path
      model_id: Optional[str] = None  # Model identifier
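
Arguments now inherits from a shared BaseArgument, which lives in the newly added evalscope/utils/argument_utils.py (file 506 above) and is not shown in this excerpt. A minimal sketch of what it plausibly provides, inferred from the from_args/__str__/to_dict helpers removed from Arguments later in this file:

    import json
    from dataclasses import dataclass
    from typing import Any, Dict


    @dataclass
    class BaseArgument:
        """Hypothetical reconstruction; the real class is in evalscope/utils/argument_utils.py."""

        def to_dict(self) -> Dict[str, Any]:
            return self.__dict__

        def __str__(self) -> str:
            return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)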
@@ -21,29 +22,46 @@ class Arguments:
      # Connection settings
      url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
      headers: Dict[str, Any] = field(default_factory=dict)  # Custom headers
-     connect_timeout: int = 120  # Connection timeout in seconds
-     read_timeout: int = 120  # Read timeout in seconds
-     api_key: str = 'EMPTY'
+     connect_timeout: int = 600  # Connection timeout in seconds
+     read_timeout: int = 600  # Read timeout in seconds
+     api_key: Optional[str] = None
+     no_test_connection: bool = False  # Test the connection before starting the benchmark

      # Performance and parallelism
-     number: Optional[int] = None  # Number of requests to be made
-     parallel: int = 1  # Number of parallel requests
+     number: Union[int, List[int]] = 1000  # Number of requests to be made
+     parallel: Union[int, List[int]] = 1  # Number of parallel requests
      rate: int = -1  # Rate limit for requests (default: -1, no limit)
+     sleep_interval: int = 5  # Sleep interval between performance runs, in seconds
+
+     # Tuning knobs
+     db_commit_interval: int = 1000  # Number of rows buffered before committing to the DB
+     queue_size_multiplier: int = 5  # Maxsize for queue = parallel * this multiplier
+     in_flight_task_multiplier: int = 2  # Max scheduled tasks = parallel * this multiplier

      # Logging and debugging
      log_every_n_query: int = 10  # Log every N queries
      debug: bool = False  # Debug mode
-     wandb_api_key: Optional[str] = None  # WandB API key for logging
+     visualizer: Optional[str] = None  # Visualizer for logging, supports 'swanlab' or 'wandb'
+     wandb_api_key: Optional[str] = None  # Will be deprecated in the future
+     swanlab_api_key: Optional[str] = None  # Will be deprecated in the future
      name: Optional[str] = None  # Name for the run

      # Output settings
      outputs_dir: str = DEFAULT_WORK_DIR

      # Prompt settings
-     max_prompt_length: int = sys.maxsize  # Maximum length of the prompt
+     max_prompt_length: int = 131072  # Maximum length of the prompt
      min_prompt_length: int = 0  # Minimum length of the prompt
+     prefix_length: int = 0  # Length of the prefix, only for random dataset
      prompt: Optional[str] = None  # The prompt text
      query_template: Optional[str] = None  # Template for the query
+     apply_chat_template: Optional[bool] = None  # Whether to apply chat template
+     # random vl settings
+     image_width: int = 224  # Width of the image for random VL dataset
+     image_height: int = 224  # Height of the image for random VL dataset
+     image_format: str = 'RGB'  # Image format for random VL dataset
+     image_num: int = 1  # Number of images for random VL dataset
+     image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation

      # Dataset settings
      dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
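
The two multiplier knobs bound buffering and scheduling relative to the concurrency level. A quick worked example with hypothetical values:

    parallel = 8
    queue_maxsize = parallel * 5    # queue_size_multiplier -> at most 40 queued requests
    max_in_flight = parallel * 2    # in_flight_task_multiplier -> at most 16 scheduled tasks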
@@ -51,68 +69,57 @@ class Arguments:
      # Response settings
      frequency_penalty: Optional[float] = None  # Frequency penalty for the response
+     repetition_penalty: Optional[float] = None  # Repetition penalty for the response
      logprobs: Optional[bool] = None  # Whether to log probabilities
      max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
      min_tokens: Optional[int] = None  # Minimum number of tokens in the response
      n_choices: Optional[int] = None  # Number of response choices
-     seed: Optional[int] = 42  # Random seed for reproducibility
-     stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
-     stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
-     stream: Optional[bool] = None  # Whether to stream the response
-     temperature: Optional[float] = None  # Temperature setting for the response
+     seed: Optional[int] = None  # Random seed for reproducibility
+     stop: Optional[List[str]] = None  # Stop sequences for the response
+     stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
+     stream: Optional[bool] = True  # Whether to stream the response
+     temperature: float = 0.0  # Temperature setting for the response
      top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
-
-     @staticmethod
-     def from_args(args):
-         return Arguments(
-             model=args.model,
-             attn_implementation=args.attn_implementation,
-             url=args.url,
-             port=args.port,
-             api_key=args.api_key,
-             connect_timeout=args.connect_timeout,
-             read_timeout=args.read_timeout,
-             number=args.number,
-             parallel=args.parallel,
-             rate=args.rate,
-             log_every_n_query=args.log_every_n_query,
-             headers=args.headers,
-             wandb_api_key=args.wandb_api_key,
-             name=args.name,
-             outputs_dir=args.outputs_dir,
-             debug=args.debug,
-             tokenizer_path=args.tokenizer_path,
-             api=args.api,
-             max_prompt_length=args.max_prompt_length,
-             min_prompt_length=args.min_prompt_length,
-             prompt=args.prompt,
-             query_template=args.query_template,
-             dataset=args.dataset,
-             dataset_path=args.dataset_path,
-             frequency_penalty=args.frequency_penalty,
-             logprobs=args.logprobs,
-             max_tokens=args.max_tokens,
-             min_tokens=args.min_tokens,
-             n_choices=args.n_choices,
-             seed=args.seed,
-             stop=args.stop,
-             stop_token_ids=args.stop_token_ids,
-             stream=args.stream,
-             temperature=args.temperature,
-             top_p=args.top_p)
+     top_k: Optional[int] = None  # Top-k sampling setting for the response
+     extra_args: Optional[Dict[str, Any]] = None  # Extra arguments

      def __post_init__(self):
+         # Set the default headers
          self.headers = self.headers or {}  # Default to empty dictionary
          if self.api_key:
              # Assuming the API key is used as a Bearer token
              self.headers['Authorization'] = f'Bearer {self.api_key}'
-         self.model_id = os.path.basename(self.model)

-     def __str__(self):
-         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+         # Set the model ID based on the model name
+         self.model_id = os.path.basename(self.model)

-     def to_dict(self) -> Dict[str, Any]:
-         return self.__dict__
+         # Set the URL based on the dataset type
+         if self.api.startswith('local'):
+             if self.dataset.startswith('speed_benchmark'):
+                 self.url = f'http://127.0.0.1:{self.port}/v1/completions'
+             else:
+                 self.url = f'http://127.0.0.1:{self.port}/v1/chat/completions'
+
+         # Set the apply_chat_template flag based on the URL
+         if self.apply_chat_template is None:
+             self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
+
+         # Set number and parallel to lists if they are integers
+         if isinstance(self.number, int):
+             self.number = [self.number]
+         if isinstance(self.parallel, int):
+             self.parallel = [self.parallel]
+         assert len(self.number) == len(
+             self.parallel
+         ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
+
+         # Validate tuning knobs
+         if self.db_commit_interval <= 0:
+             self.db_commit_interval = 1
+         if self.queue_size_multiplier <= 0:
+             self.queue_size_multiplier = 1
+         if self.in_flight_task_multiplier <= 0:
+             self.in_flight_task_multiplier = 1


  class ParseKVAction(argparse.Action):
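
number and parallel are now paired element-wise, one benchmark run per pair, with sleep_interval seconds between runs. A sketch of how the normalization above behaves (model name and values hypothetical):

    from evalscope.perf.arguments import Arguments

    # Scalars are wrapped into one-element lists:
    args = Arguments(model='Qwen2.5-7B-Instruct', number=1000, parallel=4)
    # -> args.number == [1000], args.parallel == [4]

    # Equal-length lists define a sweep: (parallel=1, number=100), (4, 400), (16, 1600)
    args = Arguments(model='Qwen2.5-7B-Instruct', number=[100, 400, 1600], parallel=[1, 4, 16])

    # Mismatched lengths fail the assert:
    Arguments(model='Qwen2.5-7B-Instruct', number=1000, parallel=[1, 4, 16])  # AssertionError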
@@ -122,7 +129,13 @@ class ParseKVAction(argparse.Action):
              setattr(namespace, self.dest, {})
          else:
              try:
-                 kv_dict = dict(kv.split('=') for kv in values)
+                 kv_dict = {}
+                 for kv in values:
+                     parts = kv.split('=', 1)  # only split the first '='
+                     if len(parts) != 2:
+                         raise ValueError(f'Invalid key-value pair: {kv}')
+                     key, value = parts
+                     kv_dict[key.strip()] = value.strip()
                  setattr(namespace, self.dest, kv_dict)
              except ValueError as e:
                  parser.error(f'Error parsing key-value pairs: {e}')
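
Splitting on only the first '=' matters for values that themselves contain '=', such as base64-padded tokens passed via --headers. Comparing the two implementations:

    # Old: dict(kv.split('=') for kv in values) raises ValueError, because
    # 'Authorization=Bearer abc=='.split('=') yields four pieces.

    # New: split once, keep the rest of the value intact.
    key, value = 'Authorization=Bearer abc=='.split('=', 1)
    # -> ('Authorization', 'Bearer abc==')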
@@ -141,26 +154,45 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
      parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
      parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
-     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
-     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
-     parser.add_argument('--read-timeout', type=int, default=120, help='The network read timeout')
+     parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
+     parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
+     parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')
+     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501

      # Performance and parallelism
-     parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
-     parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
+     parser.add_argument('-n', '--number', type=int, default=1000, nargs='+', help='How many requests to be made')
+     parser.add_argument('--parallel', type=int, default=1, nargs='+', help='Set number of concurrency requests, default 1')  # noqa: E501
      parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
+     parser.add_argument(
+         '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501
+
+     # Tuning knobs
+     parser.add_argument('--db-commit-interval', type=int, default=1000, help='Rows buffered before SQLite commit')
+     parser.add_argument('--queue-size-multiplier', type=int, default=5, help='Queue maxsize = parallel * multiplier')
+     parser.add_argument('--in-flight-task-multiplier', type=int, default=2, help='Max scheduled tasks = parallel * multiplier')  # noqa: E501

      # Logging and debugging
      parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
      parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
+     parser.add_argument('--visualizer', type=str, default=None, help='The visualizer to use, default None')
      parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
-     parser.add_argument('--name', type=str, help='The wandb db result name and result db name')
+     parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
+     parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')

      # Prompt settings
      parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
      parser.add_argument('--min-prompt-length', type=int, default=0, help='Minimum input prompt length')
+     parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
      parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
      parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
+     parser.add_argument(
+         '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt')  # noqa: E501
+     # random vl settings
+     parser.add_argument('--image-width', type=int, default=224, help='Width of the image for random VL dataset')
+     parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
+     parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
+     parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+     parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501

      # Output settings
      parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
@@ -171,19 +203,21 @@ def add_argument(parser: argparse.ArgumentParser):
      # Response settings
      parser.add_argument('--frequency-penalty', type=float, help='The frequency_penalty value', default=None)
+     parser.add_argument('--repetition-penalty', type=float, help='The repetition_penalty value', default=None)
      parser.add_argument('--logprobs', action='store_true', help='The logprobs', default=None)
      parser.add_argument(
          '--max-tokens', type=int, help='The maximum number of tokens that can be generated', default=2048)
      parser.add_argument(
          '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
      parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-     parser.add_argument('--seed', type=int, help='The random seed', default=42)
+     parser.add_argument('--seed', type=int, help='The random seed', default=None)
      parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
      parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-     parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
-     parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
+     parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
+     parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
      parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
-
+     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
+     parser.add_argument('--extra-args', type=json.loads, default='{}', help='Extra arguments, should in JSON format',)
      # yapf: enable
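Putting the new arguments together, a 1.2.0-style run sweeping two concurrency levels might look like the sketch below (model name, port, and values are hypothetical; run_perf_benchmark is the entry point seen in the __init__.py hunk, and is assumed to accept an Arguments instance):

    from evalscope.perf.arguments import Arguments
    from evalscope.perf.main import run_perf_benchmark

    task = Arguments(
        model='Qwen2.5-7B-Instruct',                      # --model
        url='http://127.0.0.1:8801/v1/chat/completions',  # --url
        dataset='openqa',                                 # --dataset
        number=[100, 400],                                # -n 100 400
        parallel=[4, 16],                                 # --parallel 4 16
        stream=True,                                      # --stream / --no-stream
        temperature=0.7,                                  # --temperature 0.7
        extra_args={'top_k': 20},                         # --extra-args '{"top_k": 20}'
    )
    run_perf_benchmark(task)
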
@@ -1,243 +1,194 @@
1
1
  import asyncio
2
- import copy
3
2
  import json
4
3
  import numpy as np
5
- import os
6
4
  import platform
7
5
  import sqlite3
8
- import threading
9
- import time
10
- from http import HTTPStatus
11
6
  from tqdm import tqdm
12
- from typing import List
13
-
14
- from evalscope.perf.arguments import Arguments
15
- from evalscope.perf.http_client import AioHttpClient, test_connection
16
- from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
17
- from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
18
- from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
19
- from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
20
- from evalscope.perf.utils.local_server import start_app
7
+ from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
8
+
21
9
  from evalscope.utils.logger import get_logger
10
+ from .arguments import Arguments
11
+ from .http_client import AioHttpClient, test_connection
12
+ from .plugin import ApiRegistry, DatasetRegistry
13
+ from .utils.benchmark_util import BenchmarkData, BenchmarkMetrics
14
+ from .utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, load_prompt, summary_result
15
+ from .utils.handler import add_signal_handlers, exception_handler
16
+
17
+ if TYPE_CHECKING:
18
+ from .plugin import ApiPluginBase, DatasetPluginBase
22
19
 
23
20
  logger = get_logger()
24
- query_send_completed_event = asyncio.Event()
21
+
25
22
  data_process_completed_event = asyncio.Event()
26
23
 
27
24
 
28
25
  @exception_handler
- async def dispatch_requests_worker(request_queue: asyncio.Queue, args: Arguments):
-     query_generator_class = ApiRegistry(args.api)
-     query_generator = query_generator_class(args.tokenizer_path)
-
-     def load_prompt(prompt_path_or_text):
-         """Load the prompt from a file or directly from the input text."""
-         if prompt_path_or_text.startswith('@'):
-             with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
-                 return file.read()
-         return prompt_path_or_text
-
-     async def dispatch_request(request):
-         """Dispatch a single request with optional rate limiting."""
-         await request_queue.put(request)
-         if args.rate != -1:
-             interval = np.random.exponential(1.0 / args.rate)
-             await asyncio.sleep(interval)
+ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGenerator[dict, None]:

-     async def dispatch_requests_from_prompt(messages):
-         """Generate and dispatch requests based on the given prompt."""
-         request = query_generator.build_request(messages, args)
-         if args.number is None:
-             await dispatch_request(request)
-             return 1
+     async def generate_requests_from_prompt():
+         prompt = load_prompt(args.prompt)
+         messages = [{'role': 'user', 'content': prompt}] if args.apply_chat_template else prompt
+         request = api_plugin.build_request(messages)
          for _ in range(args.number):
-             await dispatch_request(request)
-         return args.number
+             yield request

-     async def dispatch_requests_from_dataset():
-         """Generate and dispatch requests based on the dataset."""
-         total_query_count = 0
-         message_generator_class = DatasetRegistry(args.dataset)
+     async def generate_requests_from_dataset():
+         message_generator_class = DatasetRegistry.get_class(args.dataset)
          message_generator = message_generator_class(args)

-         for messages in message_generator:
-             request = query_generator.build_request(messages, args)
-             if request is None:
-                 continue
-             await dispatch_request(request)
-             total_query_count += 1
-             if args.number and total_query_count >= args.number:
-                 break
+         dataset_messages = []
+         try:
+             for messages in message_generator.build_messages():
+                 dataset_messages.append(messages)
+                 if len(dataset_messages) >= args.number:
+                     break
+         except StopIteration:
+             pass

-         return total_query_count
+         if not dataset_messages:
+             raise Exception('Dataset is empty!')
+
+         count = 0
+         dataset_index = 0
+
+         while count < args.number:
+             messages = dataset_messages[dataset_index]
+             request = api_plugin.build_request(messages)
+             if request is not None:
+                 yield request
+                 count += 1
+
+             dataset_index = (dataset_index + 1) % len(dataset_messages)

-     # Load prompt or dataset and dispatch requests accordingly
      if args.prompt:
-         prompt = load_prompt(args.prompt)
-         messages = [{'role': 'user', 'content': prompt}]
-         total_queries = await dispatch_requests_from_prompt(messages)
+         generator = generate_requests_from_prompt()
      elif args.dataset:
-         total_queries = await dispatch_requests_from_dataset()
+         generator = generate_requests_from_dataset()
      else:
-         raise Exception('Either prompt or dataset is required!')
+         raise ValueError('Either prompt or dataset is required!')

-     return total_queries
+     async for request in generator:
+         yield request
+         if args.rate != -1:
+             interval = np.random.exponential(1.0 / args.rate)
+             await asyncio.sleep(interval)

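Note: `get_requests` throttles by sleeping for gaps drawn from an exponential distribution with mean `1 / args.rate`, which makes request arrivals a Poisson process averaging `args.rate` requests per second (`rate == -1` disables the throttle). A quick self-contained check of that property, with an illustrative rate:

    import numpy as np

    rate = 5.0  # illustrative target of 5 requests per second
    intervals = np.random.exponential(1.0 / rate, size=100_000)

    print(intervals.mean())        # ~0.2 s average gap between requests
    print(1.0 / intervals.mean())  # ~5.0 effective requests per second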
  @exception_handler
- async def send_requests_worker(
-     task_id,
-     request_queue: asyncio.Queue,
+ async def send_request(
+     semaphore: asyncio.Semaphore,
+     request: dict,
      benchmark_data_queue: asyncio.Queue,
      args: Arguments,
+     client: AioHttpClient,  # reuse shared client
  ):
-     client = AioHttpClient(args)
-     async with client:
-         while not (query_send_completed_event.is_set() and request_queue.empty()):
-             try:
-                 # Attempt to get a request from the queue with a timeout
-                 request = await asyncio.wait_for(request_queue.get(), timeout=0.0001)
-                 request_queue.task_done()
-             except asyncio.TimeoutError:
-                 # If timeout, continue to the next iteration
-                 continue
-
-             # Initialize benchmark data for the current request
-             benchmark_data = BenchmarkData(request=request)
-             collected_messages = []
-             try:
-                 # Send the request and process the response
-                 async for is_error, state_code, response_data in client.post(request):
-                     if is_error or state_code != HTTPStatus.OK:
-                         logger.error(f'Request: {request} failed, state_code: {state_code}, data: {response_data}')
-                         benchmark_data.success = False
-                         break
-                     if response_data:
-                         collected_messages.append(response_data)
-                         benchmark_data.chunk_times.append(time.perf_counter())
-                 benchmark_data.success = True
-                 benchmark_data.update_gpu_usage()
-             except Exception as e:
-                 if response_data:
-                     collected_messages.append(response_data)
-                 benchmark_data.success = False
-                 logger.exception(e)
-                 logger.error(f'Request query: {request} exception')
-             finally:
-                 # Record completion time and collected messages
-                 benchmark_data.completed_time = time.perf_counter()
-                 benchmark_data.response_messages = collected_messages
-                 await benchmark_data_queue.put(benchmark_data)
+     async with semaphore:
+         benchmark_data = await client.post(request)
+         benchmark_data.update_gpu_usage()
+         await benchmark_data_queue.put(benchmark_data)

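Note: the per-worker queue/client loop is gone; each request now becomes one short-lived task that acquires a shared `asyncio.Semaphore` and posts through a single shared client, so connections are pooled instead of re-created per worker. A minimal sketch of the same pattern written directly against aiohttp; the URL and names are illustrative, not evalscope's API:

    import asyncio
    import aiohttp

    async def fetch(sem: asyncio.Semaphore, session: aiohttp.ClientSession, url: str) -> int:
        # The semaphore caps in-flight requests; the shared session pools connections.
        async with sem:
            async with session.get(url) as resp:
                return resp.status

    async def main() -> None:
        sem = asyncio.Semaphore(4)  # plays the role of args.parallel
        async with aiohttp.ClientSession() as session:
            tasks = [fetch(sem, session, 'http://127.0.0.1:8000/health') for _ in range(16)]
            print(await asyncio.gather(*tasks, return_exceptions=True))

    asyncio.run(main())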
  @exception_handler
- async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue, args: Arguments):
+ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
      metrics = BenchmarkMetrics(concurrency=args.parallel)
-
-     api_plugin_class = ApiRegistry(args.api)
-     api_plugin = api_plugin_class(args.tokenizer_path)
-
      result_db_path = get_result_db_path(args)
-     # Initialize wandb
-     if args.wandb_api_key:
-         import datetime
-         import wandb
-         os.environ['WANDB_SILENT'] = 'true'
-         os.environ['WANDB_DIR'] = args.outputs_dir
-
-         wandb.login(key=args.wandb_api_key)
-         current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-         name = args.name if args.name else f'{args.model_id}_{current_time}'
-         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+     # Stream inserts to DB to avoid accumulating all results in memory
+     commit_every = args.db_commit_interval
+     processed_since_commit = 0

      with sqlite3.connect(result_db_path) as con:
          cursor = con.cursor()
          create_result_table(cursor)
-         with tqdm(desc='Processing') as pbar:
+
+         with tqdm(desc='Processing', total=args.number) as pbar:
              while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
                  try:
-                     # Attempt to get benchmark data from the queue with a timeout
-                     benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-                     benchmark_data_queue.task_done()
+                     benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.1)
                  except asyncio.TimeoutError:
-                     # If timeout, continue to the next iteration
                      continue

-                 # Update metrics based on the benchmark data
+                 # Update metrics and write to DB immediately
                  metrics.update_metrics(benchmark_data, api_plugin)
-
-                 # Insert benchmark data into the database and commit the transaction
                  insert_benchmark_data(cursor, benchmark_data)
-                 con.commit()
+                 processed_since_commit += 1
+                 if processed_since_commit >= commit_every:
+                     con.commit()
+                     processed_since_commit = 0

-                 # Create a message with the updated metrics
                  message = metrics.create_message()

-                 # Log the message to wandb if the api key is provided
                  if args.wandb_api_key:
+                     import wandb
                      wandb.log(message)
+                 if args.swanlab_api_key:
+                     import swanlab
+                     swanlab.log(message)

-                 # Log the message to the logger every n queries
                  if int(metrics.n_total_queries) % args.log_every_n_query == 0:
                      msg = json.dumps(message, ensure_ascii=False, indent=2)
                      logger.info(msg)

-                 pbar.update(1)  # Update the progress bar
+                 benchmark_data_queue.task_done()
+                 pbar.update(1)
+
+         con.commit()

      return metrics, result_db_path

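Note: the consumer now commits every `args.db_commit_interval` inserts instead of once per row, amortizing the per-transaction flush cost, and issues a final `con.commit()` to capture the last partial batch. A self-contained sketch of that commit cadence; the table name and interval are illustrative:

    import sqlite3

    con = sqlite3.connect(':memory:')
    cur = con.cursor()
    cur.execute('CREATE TABLE result (id INTEGER PRIMARY KEY, latency REAL)')

    commit_every = 100  # plays the role of args.db_commit_interval
    pending = 0
    for i in range(1234):
        cur.execute('INSERT INTO result (latency) VALUES (?)', (i * 0.001,))
        pending += 1
        if pending >= commit_every:  # commit in batches, not per row
            con.commit()
            pending = 0

    con.commit()  # flush the final partial batch
    print(cur.execute('SELECT COUNT(*) FROM result').fetchone()[0])  # 1234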
  @exception_handler
- async def start_server(args: Arguments) -> bool:
-     if args.api.startswith('local'):
-         # start local server
-         server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
-         server.start()
-
-         if args.dataset.startswith('speed_benchmark'):
-             args.url = f'http://127.0.0.1:{args.port}/v1/completions'
-         else:
-             args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
-
-     if not await test_connection(args):
+ async def connect_test(args: Arguments, api_plugin) -> bool:
+     if (not args.no_test_connection) and (not await test_connection(args, api_plugin)):
          raise TimeoutError('Test connection failed')

  @exception_handler
- async def benchmark(args: Arguments) -> None:
+ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
      if platform.system() != 'Windows':
          loop = asyncio.get_running_loop()
          add_signal_handlers(loop)

-     request_queue = asyncio.Queue()
-     benchmark_data_queue = asyncio.Queue()
+     api_plugin_class = ApiRegistry.get_class(args.api)
+     api_plugin = api_plugin_class(args)

-     async def create_send_request_tasks():
-         tasks: List[asyncio.Task] = []
-         for idx in range(args.parallel):
-             task = asyncio.create_task(send_requests_worker(idx, request_queue, benchmark_data_queue, args))
-             tasks.append(task)
-         return tasks
+     benchmark_data_queue: asyncio.Queue = asyncio.Queue(maxsize=max(1, args.parallel * args.queue_size_multiplier))
+     data_process_completed_event.clear()

-     async def run_tasks():
-         await start_server(args)
+     # test connection
+     await connect_test(args, api_plugin)

-         dispatch_task = asyncio.create_task(dispatch_requests_worker(request_queue, args))
+     # Create a single shared client session for all requests
+     client = AioHttpClient(args, api_plugin)
+     async with client:
+         # start statistic benchmark metric (consumer)
          statistic_benchmark_metric_task = asyncio.create_task(
-             statistic_benchmark_metric_worker(benchmark_data_queue, args))
-         send_request_tasks = await create_send_request_tasks()
+             statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+         )
+
+         # start sending requests with bounded in-flight tasks
+         semaphore = asyncio.Semaphore(args.parallel)
+         in_flight: set[asyncio.Task] = set()
+         max_in_flight = args.parallel * args.in_flight_task_multiplier

-         expected_number_of_queries = await dispatch_task
-         await request_queue.join()
-         query_send_completed_event.set()
+         async for request in get_requests(args, api_plugin):
+             # Keep the number of scheduled tasks bounded to avoid OOM
+             if len(in_flight) >= max_in_flight:
+                 done, pending = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
+                 in_flight = pending

-         await asyncio.gather(*send_request_tasks, return_exceptions=True)
+             task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, client))
+             in_flight.add(task)
+
+         # Wait for remaining in-flight tasks
+         if in_flight:
+             await asyncio.gather(*in_flight, return_exceptions=True)
+
+         # Drain queue and finish
          await benchmark_data_queue.join()
          data_process_completed_event.set()

          metrics, result_db_path = await statistic_benchmark_metric_task
-         summary_result(args, metrics, expected_number_of_queries, result_db_path)
-
-         await asyncio.sleep(0.250)

-     await run_tasks()
+         metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+         return metrics_result, percentile_result
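Note: `benchmark` now bounds two things separately: concurrent requests (the semaphore inside `send_request`) and the number of created-but-unfinished tasks (`asyncio.wait` with `FIRST_COMPLETED` once `max_in_flight` is reached), so a fast request generator cannot pile up unbounded task objects. A minimal, self-contained sketch of that windowing pattern; the constants and `do_work` coroutine are illustrative, not evalscope's API:

    import asyncio

    MAX_IN_FLIGHT = 8  # plays the role of args.parallel * args.in_flight_task_multiplier

    async def do_work(i: int) -> int:
        await asyncio.sleep(0.01)  # stand-in for one HTTP request
        return i

    async def main() -> None:
        in_flight: set = set()
        results = []
        for i in range(100):
            if len(in_flight) >= MAX_IN_FLIGHT:
                # Wait for at least one task to finish before scheduling more.
                done, in_flight = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
                results.extend(t.result() for t in done)
            in_flight.add(asyncio.create_task(do_work(i)))

        done, _ = await asyncio.wait(in_flight)  # drain the remaining window
        results.extend(t.result() for t in done)
        print(len(results))  # 100

    asyncio.run(main())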