evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/utils/db_util.py

@@ -2,11 +2,12 @@ import base64
  import json
  import os
  import pickle
+ import re
  import sqlite3
  import sys
  from datetime import datetime
  from tabulate import tabulate
- from typing import Dict, List
+ from typing import Dict, List, Tuple

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -15,6 +16,28 @@ from evalscope.utils.logger import get_logger
  logger = get_logger()


+ class DatabaseColumns:
+     REQUEST = 'request'
+     START_TIME = 'start_time'
+     INTER_TOKEN_LATENCIES = 'inter_token_latencies'
+     SUCCESS = 'success'
+     RESPONSE_MESSAGES = 'response_messages'
+     COMPLETED_TIME = 'completed_time'
+     LATENCY = 'latency'
+     FIRST_CHUNK_LATENCY = 'first_chunk_latency'
+     PROMPT_TOKENS = 'prompt_tokens'
+     COMPLETION_TOKENS = 'completion_tokens'
+     MAX_GPU_MEMORY_COST = 'max_gpu_memory_cost'
+     TIME_PER_OUTPUT_TOKEN = 'time_per_output_token'
+
+
+ def load_prompt(prompt_path_or_text):
+     if prompt_path_or_text.startswith('@'):
+         with open(prompt_path_or_text[1:], 'r', encoding='utf-8') as file:
+             return file.read()
+     return prompt_path_or_text
+
+
  def encode_data(data) -> str:
      """Encodes data using base64 and pickle."""
      return base64.b64encode(pickle.dumps(data)).decode('utf-8')
@@ -33,32 +56,34 @@ def transpose_results(data):


  def create_result_table(cursor):
-     cursor.execute('''CREATE TABLE IF NOT EXISTS result(
-         request TEXT,
-         start_time REAL,
-         chunk_times TEXT,
-         success INTEGER,
-         response_messages TEXT,
-         completed_time REAL,
-         latency REAL,
-         first_chunk_latency REAL,
-         n_chunks INTEGER,
-         chunk_time REAL,
-         prompt_tokens INTEGER,
-         completion_tokens INTEGER,
-         max_gpu_memory_cost REAL)''')
+     cursor.execute(
+         f'''CREATE TABLE IF NOT EXISTS result(
+         {DatabaseColumns.REQUEST} TEXT,
+         {DatabaseColumns.START_TIME} REAL,
+         {DatabaseColumns.INTER_TOKEN_LATENCIES} TEXT,
+         {DatabaseColumns.SUCCESS} INTEGER,
+         {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
+         {DatabaseColumns.COMPLETED_TIME} REAL,
+         {DatabaseColumns.LATENCY} REAL,
+         {DatabaseColumns.FIRST_CHUNK_LATENCY} REAL,
+         {DatabaseColumns.PROMPT_TOKENS} INTEGER,
+         {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
+         {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
+         {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
+         )'''
+     )


  def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
-     request = encode_data(benchmark_data.request)
-     chunk_times = json.dumps(benchmark_data.chunk_times)
+     request = benchmark_data.request
+     inter_token_latencies = json.dumps(benchmark_data.inter_chunk_latency)
      response_messages = encode_data(benchmark_data.response_messages)

      # Columns common to both success and failure cases
      common_columns = (
          request,
          benchmark_data.start_time,
-         chunk_times,
+         inter_token_latencies,
          benchmark_data.success,
          response_messages,
          benchmark_data.completed_time,
@@ -67,23 +92,21 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
      if benchmark_data.success:
          # Add additional columns for success case
          additional_columns = (
-             benchmark_data.query_latency,
-             benchmark_data.first_chunk_latency,
-             benchmark_data.n_chunks,
-             benchmark_data.n_chunks_time,
-             benchmark_data.prompt_tokens,
-             benchmark_data.completion_tokens,
-             benchmark_data.max_gpu_memory_cost,
+             benchmark_data.query_latency, benchmark_data.first_chunk_latency, benchmark_data.prompt_tokens,
+             benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
          )
-         query = """INSERT INTO result(
-             request, start_time, chunk_times, success, response_messages,
-             completed_time, latency, first_chunk_latency,
-             n_chunks, chunk_time, prompt_tokens, completion_tokens, max_gpu_memory_cost
-             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+         query = f"""INSERT INTO result(
+             {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
+             {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
+             {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
+             {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
+             {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
          cursor.execute(query, common_columns + additional_columns)
      else:
-         query = """INSERT INTO result(
-             request, start_time, chunk_times, success, response_messages, completed_time
+         query = f"""INSERT INTO result(
+             {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
+             {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
              ) VALUES (?, ?, ?, ?, ?, ?)"""
          cursor.execute(query, common_columns)

@@ -91,6 +114,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
  def get_output_path(args: Arguments) -> str:
      current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
      output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+     # Filter illegal characters
+     output_path = re.sub(r'[<>:"|?*]', '_', output_path)
      if not os.path.exists(output_path):
          os.makedirs(output_path, exist_ok=True)
      logger.info(f'Save the result to: {output_path}')
@@ -102,12 +127,24 @@ def get_result_db_path(args: Arguments):

      logger.info(f'Save the data base to: {result_db_path}')
      if os.path.exists(result_db_path):
-         logger.warning('The db file exists, delete it and start again!.')
+         logger.error(f'The db file {result_db_path} exists, delete it and start again!.')
          sys.exit(1)

      return result_db_path


+ class PercentileMetrics:
+     TTFT = 'TTFT (s)'
+     ITL = 'ITL (s)'
+     TPOT = 'TPOT (s)'
+     LATENCY = 'Latency (s)'
+     INPUT_TOKENS = 'Input tokens'
+     OUTPUT_TOKENS = 'Output tokens'
+     OUTPUT_THROUGHPUT = 'Output (tok/s)'
+     TOTAL_THROUGHPUT = 'Total (tok/s)'
+     PERCENTILES = 'Percentiles'
+
+
  def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
      """
      Calculate the percentiles for a specific list of data.
@@ -136,54 +173,51 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
      :param result_db_path: Path to the SQLite database file.
      :return: Dictionary of percentiles for various metrics.
      """
-
-     def inter_token_latencies(chunk_times_json: str) -> List[float]:
-         try:
-             chunk_times = json.loads(chunk_times_json)
-             return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
-         except (json.JSONDecodeError, TypeError) as e:
-             logger.error(f'Error parsing chunk times: {e}')
-             return []
-
-     query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
-                  'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                  'FROM result WHERE success=1')
+     query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES}, {DatabaseColumns.SUCCESS},
+         {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
+         {DatabaseColumns.PROMPT_TOKENS},
+         {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
+         FROM result WHERE {DatabaseColumns.SUCCESS}=1'''  # noqa: E501

      percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

      with sqlite3.connect(result_db_path) as con:
-         rows = con.execute(query_sql).fetchall()
-
-         if len(rows) < len(percentiles):
-             logger.info('Too little data to calculate quantiles!')
-             return {}
+         cursor = con.cursor()
+         cursor.execute(query_sql)
+         columns = [description[0] for description in cursor.description]
+         rows = cursor.fetchall()

-         # Define index variables for columns
-         CHUNK_TIMES_INDEX = 1
-         LATENCY_INDEX = 4
-         FIRST_CHUNK_LATENCY_INDEX = 5
-         PROMPT_TOKENS_INDEX = 8
-         COMPLETION_TOKENS_INDEX = 9
+         # Create column index mapping
+         col_indices = {col: idx for idx, col in enumerate(columns)}

          # Prepare data for each metric
          inter_token_latencies_all = []
          for row in rows:
-             inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+             try:
+                 itl = json.loads(row[col_indices[DatabaseColumns.INTER_TOKEN_LATENCIES]]) or []
+                 inter_token_latencies_all.extend(itl)
+             except (json.JSONDecodeError, TypeError) as e:
+                 logger.error(f'Error parsing inter token latencies: {e}')

          metrics = {
-             'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
-             'TPOT (s)':
+             PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
+             PercentileMetrics.ITL:
                  inter_token_latencies_all,
-             'Latency (s)': [row[LATENCY_INDEX] for row in rows],
-             'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
-             'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
-             'Throughput(tokens/s)':
-                 [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
-                  for row in rows]
+             PercentileMetrics.TPOT: [row[col_indices[DatabaseColumns.TIME_PER_OUTPUT_TOKEN]] for row in rows],
+             PercentileMetrics.LATENCY: [row[col_indices[DatabaseColumns.LATENCY]] for row in rows],
+             PercentileMetrics.INPUT_TOKENS: [row[col_indices[DatabaseColumns.PROMPT_TOKENS]] for row in rows],
+             PercentileMetrics.OUTPUT_TOKENS: [row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] for row in rows],
+             PercentileMetrics.OUTPUT_THROUGHPUT:
+                 [(row[col_indices[DatabaseColumns.COMPLETION_TOKENS]] / row[col_indices[DatabaseColumns.LATENCY]])
+                  if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows],
+             PercentileMetrics.TOTAL_THROUGHPUT:
+                 [((row[col_indices[DatabaseColumns.PROMPT_TOKENS]] + row[col_indices[DatabaseColumns.COMPLETION_TOKENS]])
+                   / row[col_indices[DatabaseColumns.LATENCY]])
+                  if row[col_indices[DatabaseColumns.LATENCY]] > 0 else float('nan') for row in rows]
          }

          # Calculate percentiles for each metric
-         results = {'Percentile': [f'{p}%' for p in percentiles]}
+         results = {PercentileMetrics.PERCENTILES: [f'{p}%' for p in percentiles]}
          for metric_name, data in metrics.items():
              metric_percentiles = calculate_percentiles(data, percentiles)
              results[metric_name] = [metric_percentiles[p] for p in percentiles]
@@ -191,16 +225,15 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
      return results


- def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
+ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
      result_path = os.path.dirname(result_db_path)
      write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))

-     data = metrics.create_message()
-     data.update({'Expected number of requests': expected_number_of_queries, 'Result DB path': result_db_path})
-     write_json_file(data, os.path.join(result_path, 'benchmark_summary.json'))
+     metrics_result = metrics.create_message()
+     write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))

      # Print summary in a table
-     table = tabulate(list(data.items()), headers=['Key', 'Value'], tablefmt='grid')
+     table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
      logger.info('\nBenchmarking summary:\n' + table)

      # Get percentile results
@@ -214,20 +247,24 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_o
      if args.dataset.startswith('speed_benchmark'):
          speed_benchmark_result(result_db_path)

+     logger.info(f'Save the summary to: {result_path}')
+
+     return metrics_result, percentile_result
+

  def speed_benchmark_result(result_db_path: str):
-     query_sql = """
+     query_sql = f"""
      SELECT
-         prompt_tokens,
-         ROUND(AVG(completion_tokens / latency), 2) AS avg_completion_token_per_second,
-         ROUND(AVG(max_gpu_memory_cost), 2)
+         {DatabaseColumns.PROMPT_TOKENS},
+         ROUND(AVG({DatabaseColumns.COMPLETION_TOKENS} / {DatabaseColumns.LATENCY}), 2) AS avg_completion_token_per_second,
+         ROUND(AVG({DatabaseColumns.MAX_GPU_MEMORY_COST}), 2)
      FROM
          result
      WHERE
-         success = 1 AND latency > 0
+         {DatabaseColumns.SUCCESS} = 1 AND {DatabaseColumns.LATENCY} > 0
      GROUP BY
-         prompt_tokens
-     """
+         {DatabaseColumns.PROMPT_TOKENS}
+     """  # noqa: E501

      with sqlite3.connect(result_db_path) as con:
          cursor = con.cursor()
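The load_prompt helper added to db_util.py above resolves a prompt argument either as literal text or, when prefixed with '@', as a path to a UTF-8 file whose contents are returned. A minimal usage sketch (the prompt file name below is a hypothetical example):

from evalscope.perf.utils.db_util import load_prompt

# Plain text is returned unchanged.
prompt = load_prompt('Summarize the following document.')

# A leading '@' means "read the rest as a file path"; 'my_prompt.txt' is hypothetical.
prompt_from_file = load_prompt('@my_prompt.txt')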
evalscope/perf/utils/local_server.py

@@ -1,68 +1,28 @@
  import os
  import subprocess
- import torch
  import uvicorn
  from contextlib import asynccontextmanager
- from dataclasses import dataclass
  from fastapi import FastAPI
  from fastapi.middleware.cors import CORSMiddleware
  from sse_starlette.sse import EventSourceResponse

  from evalscope.perf.arguments import Arguments
  from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()


- @dataclass
- class ServerSentEvent(object):
-
-     def __init__(self, data='', event=None, id=None, retry=None):
-         self.data = data
-         self.event = event
-         self.id = id
-         self.retry = retry
-
-     @classmethod
-     def decode(cls, line):
-         """Decode line to ServerSentEvent
-
-
-         Args:
-             line (str): The line.
-
-         Return:
-             ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
-
-         """
-         if not line:
-             return None
-         sse_msg = cls()
-         # format data:xxx
-         field_type, _, field_value = line.partition(':')
-         if field_value.startswith(' '):  # compatible with openai api
-             field_value = field_value[1:]
-         if field_type == 'event':
-             sse_msg.event = field_value
-         elif field_type == 'data':
-             field_value = field_value.rstrip()
-             sse_msg.data = field_value
-         elif field_type == 'id':
-             sse_msg.id = field_value
-         elif field_type == 'retry':
-             sse_msg.retry = field_value
-         else:
-             pass
-
-         return sse_msg
-
-
  @asynccontextmanager
  async def lifespan(app: FastAPI):
      yield
-     if torch.cuda.is_available():
-         torch.cuda.empty_cache()
+     try:
+         import torch
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+     except ImportError:
+         pass


  def create_app(model, attn_implementation=None) -> FastAPI:
@@ -96,11 +56,16 @@ def create_app(model, attn_implementation=None) -> FastAPI:


  def start_app(args: Arguments):
+     logger.info('Starting local server, please wait...')
      if args.api == 'local':
+         check_import('torch', 'torch', raise_error=True)
+
          app = create_app(args.model, args.attn_implementation)
          uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

      elif args.api == 'local_vllm':
+         import torch
+
          os.environ['VLLM_USE_MODELSCOPE'] = 'True'
          os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
          os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
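local_server.py now treats torch as an optional dependency: the module-level import is removed, the GPU cache cleanup in lifespan is wrapped in try/except ImportError, and the 'local' backend calls check_import('torch', 'torch', raise_error=True) before building the app. Only the call site of check_import is visible in this diff; a rough, hypothetical sketch of a helper with that shape (the real implementation lives in evalscope.utils.import_utils and may differ) could look like:

import importlib.util


def check_import(module_name: str, package_name: str, raise_error: bool = False) -> bool:
    """Hypothetical stand-in: report whether `module_name` is importable."""
    if importlib.util.find_spec(module_name) is not None:
        return True
    message = f"'{package_name}' is required for this feature. Install it with: pip install {package_name}"
    if raise_error:
        raise ImportError(message)
    print(message)
    return False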
@@ -0,0 +1,63 @@
+ import os
+
+ from evalscope.perf.arguments import Arguments
+
+
+ def init_wandb(args: Arguments) -> None:
+     """
+     Initialize WandB for logging.
+     """
+     # Initialize wandb if the api key is provided
+     import datetime
+     try:
+         import wandb
+     except ImportError:
+         raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
+     os.environ['WANDB_SILENT'] = 'true'
+     os.environ['WANDB_DIR'] = args.outputs_dir
+     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+     name = args.name if args.name else f'{args.model_id}_{current_time}'
+
+     # Remove sensitive information from logging config
+     logging_config = args.to_dict()
+     logging_config.pop('api_key', None)
+     logging_config.pop('wandb_api_key', None)
+
+     if args.wandb_api_key is not None:
+         wandb.login(key=args.wandb_api_key)
+     wandb.init(project='perf_benchmark', name=name, config=logging_config)
+
+
+ def init_swanlab(args: Arguments) -> None:
+     """
+     Initialize SwanLab for logging.
+     """
+     import datetime
+     try:
+         import swanlab
+     except ImportError:
+         raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
+     os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
+     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+     name = args.name if args.name else f'{args.model_id}_{current_time}'
+     swanlab.config.update({'framework': '📏evalscope'})
+
+     # Remove sensitive information from logging config
+     logging_config = args.to_dict()
+     logging_config.pop('api_key', None)
+     logging_config.pop('swanlab_api_key', None)
+
+     init_kwargs = {
+         'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
+         'name': name,
+         'config': logging_config,
+         'mode': 'local' if args.swanlab_api_key == 'local' else None
+     }
+
+     workspace = os.getenv('SWANLAB_WORKSPACE')
+     if workspace:
+         init_kwargs['workspace'] = workspace
+
+     if isinstance(args.swanlab_api_key, str) and not args.swanlab_api_key == 'local':
+         swanlab.login(api_key=args.swanlab_api_key)
+     swanlab.init(**init_kwargs)
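Both initializers take the same perf Arguments object and strip API keys from the logged config before sending it to the tracker. A hedged usage sketch follows; the wrapper name is hypothetical, while init_wandb, init_swanlab, and the argument fields are the ones shown in the file above:

    from evalscope.perf.arguments import Arguments

    def maybe_init_trackers(args: Arguments) -> None:
        # Hypothetical glue code: pick a tracker based on which key was supplied.
        if args.wandb_api_key:
            init_wandb(args)    # sanitized run config goes to the 'perf_benchmark' W&B project
        if args.swanlab_api_key:
            init_swanlab(args)  # a key of 'local' runs SwanLab in local mode, as handled above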
@@ -0,0 +1,192 @@
+ # the following code is largely adapted from https://github.com/lework/llm-benchmark
+
+ import numpy as np
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.style import Style
+ from rich.table import Table
+ from rich.text import Text
+
+ from evalscope.utils.logger import get_logger
+ from .benchmark_util import Metrics
+ from .db_util import PercentileMetrics
+
+ logger = get_logger()
+
+
+ def analyze_results(all_results):
+     """Analyze all test results and generate a summary report"""
+     summary = []
+     total_tokens = 0
+     total_time = 0
+
+     for result in all_results:
+         total_metrics = result[0]
+         percentile_metrics = result[1]
+         percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
+         try:
+             concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
+             rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
+             avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
+             p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
+             avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
+             avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
+             p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
+             success_rate = (
+                 total_metrics.get(Metrics.SUCCEED_REQUESTS, 0) / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)
+             ) * 100
+             avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
+             p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]
+
+             # Ensure all values are valid numbers
+             if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
+                 logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
+                 continue
+
+             summary.append([
+                 concurrency,
+                 f'{rps:.2f}' if rps is not None else 'N/A',
+                 f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
+                 f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
+                 f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
+                 f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
+                 f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
+                 f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
+                 f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
+                 f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
+             ])
+
+             total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST,
+                                               0) * total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+             total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
+         except Exception as e:
+             logger.warning(
+                 f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}"
+             )
+             continue
+
+     if not summary:
+         logger.warning('Error: No valid test result data')
+         return [], 0, 0
+
+     return summary, total_tokens, total_time
+
+
+ def print_summary(all_results, model_name):
+     """Print test results summary"""
+     summary, total_tokens, total_time = analyze_results(all_results)
+
+     if not summary:
+         logger.warning('No available test result data to display')
+         return
+
+     console = Console(width=100)  # Set fixed width
+
+     # Create title panel
+     title = Text('Performance Test Summary Report', style='bold')
+     console.print(Panel(title, width=60))
+
+     # Print basic information
+     basic_info = Table(show_header=False, width=60)
+     basic_info.add_column('Name', style='cyan', width=25)
+     basic_info.add_column('Value', style='green', width=35)
+
+     basic_info.add_row('Model', model_name)
+     basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
+     basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
+     basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')
+
+     console.print('\nBasic Information:')
+     console.print(basic_info)
+
+     # Create detailed performance metrics table
+     table = Table(
+         title='Detailed Performance Metrics',
+         show_header=True,
+         header_style='bold cyan',
+         border_style='blue',
+         width=100,  # Set total table width
+         pad_edge=False,  # Reduce edge padding
+         min_width=60,  # Minimum width
+     )
+
+     # Add columns (set fixed column widths)
+     table.add_column('Conc.', justify='right', style='cyan')
+     table.add_column('RPS', justify='right')
+     table.add_column('Avg Lat.(s)', justify='right')
+     table.add_column('P99 Lat.(s)', justify='right')
+     table.add_column('Gen. toks/s', justify='right')
+     table.add_column('Avg TTFT(s)', justify='right')
+     table.add_column('P99 TTFT(s)', justify='right')
+     table.add_column('Avg TPOT(s)', justify='right')
+     table.add_column('P99 TPOT(s)', justify='right')
+     table.add_column('Success Rate', justify='right', style='green')
+
+     # Add data rows
+     for row in summary:
+         try:
+             # Set row style based on success rate
+             success_rate = float(row[6].rstrip('%'))
+             row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
+
+             table.add_row(
+                 str(row[0]),  # Concurrency
+                 f'{float(row[1]):.2f}',  # RPS
+                 f'{float(row[2]):.3f}',  # Average Latency
+                 f'{float(row[3]):.3f}',  # P99 Latency
+                 f'{float(row[4]):.2f}',  # Average TPS
+                 f'{float(row[5]):.3f}',  # First Token Latency
+                 f'{float(row[7]):.3f}',  # P99 TTFT
+                 f'{float(row[8]):.3f}',  # Average TPOT
+                 f'{float(row[9]):.3f}',  # P99 TPOT
+                 row[6],  # Success Rate
+                 style=row_style
+             )
+         except ValueError as e:
+             console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
+             continue
+
+     console.print('\n')
+     console.print(table)
+
+     # Calculate and display best performance configuration
+     try:
+         best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
+         best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])
+
+         perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
+         perf_info.add_column('Metric', style='cyan', width=20)
+         perf_info.add_column('Value', style='green', width=40)
+
+         perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
+         perf_info.add_row(
+             'Lowest Latency', f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)'
+         )
+
+         console.print('\n')
+         console.print(perf_info)
+
+         # Performance recommendations
+         recommendations = []
+         if best_rps_idx == len(summary) - 1:
+             recommendations.append(
+                 'The system seems not to have reached its performance bottleneck, try higher concurrency'
+             )
+         elif best_rps_idx == 0:
+             recommendations.append('Consider lowering concurrency, current load may be too high')
+         else:
+             recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')
+
+         success_rate = float(summary[-1][6][:-1])
+         if success_rate < 95:
+             recommendations.append(
+                 'Success rate is low at high concurrency, check system resources or reduce concurrency'
+             )
+
+         recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
+         console.print(recommend_text)
+         for rec in recommendations:
+             console.print(f'• {rec}', style='yellow')
+
+     except Exception as e:
+         console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
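For orientation, here is a minimal sketch of the all_results shape that print_summary consumes, inferred from analyze_results above. It assumes it runs inside the same module (so Metrics, PercentileMetrics, and print_summary resolve), and every number, along with the model name, is purely illustrative:

    # One (total_metrics, percentile_metrics) pair per concurrency level that was benchmarked.
    total_metrics = {
        Metrics.NUMBER_OF_CONCURRENCY: 8,
        Metrics.REQUEST_THROUGHPUT: 12.5,
        Metrics.AVERAGE_LATENCY: 0.62,
        Metrics.OUTPUT_TOKEN_THROUGHPUT: 950.0,
        Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.11,
        Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: 0.02,
        Metrics.SUCCEED_REQUESTS: 100,
        Metrics.TOTAL_REQUESTS: 100,
        Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: 256,
        Metrics.TIME_TAKEN_FOR_TESTS: 80.0,
    }
    percentile_metrics = {
        PercentileMetrics.PERCENTILES: ['50%', '90%', '99%'],
        PercentileMetrics.LATENCY: [0.55, 0.80, 1.20],
        PercentileMetrics.TTFT: [0.09, 0.15, 0.30],
        PercentileMetrics.TPOT: [0.018, 0.022, 0.030],
    }
    print_summary([(total_metrics, percentile_metrics)], model_name='my-model')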