evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/http_client.py CHANGED
@@ -3,6 +3,7 @@ import asyncio
 import time
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple
 
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 from .arguments import Arguments
 
@@ -24,7 +25,22 @@ class AioHttpClient:
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
         self.api_plugin = api_plugin
+
+        # Configure connector similar to vLLM bench for better TTFT under load.
+        connector = aiohttp.TCPConnector(
+            limit=args.parallel or 0,  # 0 means no limit in aiohttp; use parallel as limit if set
+            limit_per_host=args.parallel or 0,
+            ttl_dns_cache=300,
+            use_dns_cache=True,
+            keepalive_timeout=60,
+            enable_cleanup_closed=True,
+            force_close=False,
+            ssl=('https://' in self.url),
+        )
+
         self.client = aiohttp.ClientSession(
+            connector=connector,
+            trust_env=True,
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             trace_configs=[self._create_trace_config()] if args.debug else []
         )
@@ -43,23 +59,25 @@ class AioHttpClient:
         trace_config.on_response_chunk_received.append(self.on_response_chunk_received)
         return trace_config
 
-    async def post(self, body):
-        """Send POST request and delegate response handling to API plugin.
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
+    async def post(self, body) -> BenchmarkData:
+        """
+        Send POST request and delegate response handling to API plugin.
+
+        Returns:
+            BenchmarkData: The benchmark data object containing request and response information.
         """
         try:
             # Delegate the request processing to the API plugin
-            async for result in self.api_plugin.process_request(self.client, self.url, self.headers, body):
-                yield result
+            output = await self.api_plugin.process_request(self.client, self.url, self.headers, body)
+            return output
         except asyncio.TimeoutError as e:
             logger.error(
                 f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longer timeout.'  # noqa: E501
             )
-            yield (True, None, str(e))
+            return BenchmarkData(success=False, error=str(e))
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
-            yield (True, None, str(e))
+            return BenchmarkData(success=False, error=str(e))
 
     @staticmethod
     async def on_request_start(session, context, params: aiohttp.TraceRequestStartParams):
@@ -91,7 +109,6 @@ class AioHttpClient:
 
 
 async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
-    is_error = True
     start_time = time.perf_counter()
 
    async def attempt_connection():
@@ -100,18 +117,16 @@ async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
             messages = [{'role': 'user', 'content': 'hello'}] if args.apply_chat_template else 'hello'
             request = api_plugin.build_request(messages)
 
-            async for is_error, state_code, response_data in client.post(request):
-                return is_error, state_code, response_data
+            output = await client.post(request)
+            return output
 
     while True:
         try:
-            is_error, state_code, response_data = await asyncio.wait_for(
-                attempt_connection(), timeout=args.connect_timeout
-            )
-            if not is_error:
+            output = await asyncio.wait_for(attempt_connection(), timeout=args.connect_timeout)
+            if output.success:
                 logger.info('Test connection successful.')
                 return True
-            logger.warning(f'Retrying... <{state_code}> {response_data}')
+            logger.warning(f'Retrying... <{output.error}>')
         except Exception as e:
             logger.warning(f'Retrying... <{e}>')
 
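With this change `AioHttpClient.post` is a plain coroutine returning one `BenchmarkData` per request instead of an async generator of `(is_error, status_code, response_data)` tuples. A minimal consumer sketch of the new contract (the `success`, `error`, and `generated_text` fields all appear in the hunks of this changeset; client construction and the request body are elided):

    output = await client.post(request_body)
    if output.success:
        print(output.generated_text)
    else:
        print(f'request failed: {output.error}')
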
evalscope/perf/main.py CHANGED
@@ -4,7 +4,9 @@ import os
 import platform
 import threading
 import time
+import warnings
 from argparse import Namespace
+from logging import warn
 
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
@@ -79,9 +81,20 @@ def run_perf_benchmark(args):
     configure_logging(args.debug, os.path.join(output_path, 'benchmark.log'))
 
     # Initialize wandb and swanlab
-    if args.wandb_api_key:
+    visualizer = args.visualizer
+    if visualizer is None:
+        if args.wandb_api_key is not None:
+            visualizer = 'wandb'
+            warnings.warn('--wandb-api-key is deprecated. Please use `--visualizer wandb` instead.', DeprecationWarning)
+        elif args.swanlab_api_key is not None:
+            visualizer = 'swanlab'
+            warnings.warn(
+                '--swanlab-api-key is deprecated. Please use `--visualizer swanlab` instead.', DeprecationWarning
+            )
+    args.visualizer = visualizer
+    if visualizer == 'wandb':
         init_wandb(args)
-    if args.swanlab_api_key:
+    elif visualizer == 'swanlab':
         init_swanlab(args)
 
     # Initialize local server if needed
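In practice the migration is a flag swap on the perf command line; a hedged example (only `--visualizer`, `--wandb-api-key`, and `--swanlab-api-key` are confirmed by the hunk above; the rest of the invocation is illustrative):

    # before: still works, now emits a DeprecationWarning
    evalscope perf --url http://127.0.0.1:8000/v1/chat/completions --model test --wandb-api-key <key>
    # after
    evalscope perf --url http://127.0.0.1:8000/v1/chat/completions --model test --visualizer wandb
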
evalscope/perf/plugin/api/base.py CHANGED
@@ -3,6 +3,7 @@ from abc import abstractmethod
 from typing import Any, AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 
 
 class ApiPluginBase:
@@ -28,13 +29,13 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_responses(self, responses: List, request: Any = None, **kwargs: Any) -> Tuple[int, int]:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
         """Parser responses and return number of request and response tokens.
 
         Args:
-            responses (List[bytes]): List of http response body, for stream output,
+            responses (List[Dict]): List of http response body, for stream output,
                 there are multiple responses, each is bytes, for general only one.
-            request (Any): The request body.
+            request (str): The json string of request.
 
         Returns:
             Tuple: (Number of prompt_tokens and number of completion_tokens).
@@ -42,8 +43,9 @@ class ApiPluginBase:
         raise NotImplementedError
 
     @abstractmethod
-    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -52,8 +54,8 @@ class ApiPluginBase:
             headers: The request headers
             body: The request body
 
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
         raise NotImplementedError
 
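Taken together, the new contract for a plugin is: return a single `BenchmarkData` from `process_request` and report token counts from `parse_responses`. A minimal conforming sketch, assuming only the `BenchmarkData` fields visible in this changeset (`start_time`, `completed_time`, `query_latency`, `response_messages`, `success`, `error`, `request`); the endpoint shape is illustrative and the abstract `build_request` is omitted for brevity:

    import json
    import time
    from typing import Any, Dict, List, Tuple

    import aiohttp

    from evalscope.perf.plugin.api.base import ApiPluginBase
    from evalscope.perf.utils.benchmark_util import BenchmarkData


    class EchoApiPlugin(ApiPluginBase):  # hypothetical plugin, for illustration only

        def parse_responses(self, responses: List[Dict], request: str = None, **kwargs: Any) -> Tuple[int, int]:
            # Read token counts from the last response's usage block, if present.
            usage = (responses[-1].get('usage') or {}) if responses else {}
            return usage.get('prompt_tokens', 0), usage.get('completion_tokens', 0)

        async def process_request(
            self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
        ) -> BenchmarkData:
            output = BenchmarkData()
            output.start_time = time.perf_counter()
            output.request = json.dumps(body, ensure_ascii=False)
            async with client_session.post(url, data=output.request, headers=headers) as resp:
                if resp.status == 200:
                    payload = await resp.json()
                    output.response_messages.append(payload)
                    output.success = True
                else:
                    output.success = False
                    output.error = await resp.text()
            output.completed_time = time.perf_counter()
            output.query_latency = output.completed_time - output.start_time
            return output
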
evalscope/perf/plugin/api/custom_api.py CHANGED
@@ -5,6 +5,7 @@ from typing import Any, AsyncGenerator, Dict, List, Tuple, Union
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
 from evalscope.perf.plugin.registry import register_api
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -98,7 +99,7 @@ class CustomPlugin(ApiPluginBase):
 
         return payload
 
-    def parse_responses(self, responses: List[str], request: Any = None, **kwargs) -> Tuple[int, int]:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> Tuple[int, int]:
         """Parse API responses and return token counts.
 
         This method extracts the number of input and output tokens from the API responses.
@@ -106,8 +107,8 @@ class CustomPlugin(ApiPluginBase):
         to calculate it using a tokenizer.
 
         Args:
-            responses (List[str]): List of API response strings.
-            request (Any, optional): The original request, which might be needed for token calculation.
+            responses (List[Dict]): List of API response strings.
+            request (str, optional): The original request, which might be needed for token calculation.
             **kwargs: Additional arguments.
 
         Returns:
@@ -160,8 +161,9 @@ class CustomPlugin(ApiPluginBase):
             logger.error(f'Error parsing responses: {e}')
             return 0, 0
 
-    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         This method handles sending the request to your API and processing the response,
@@ -173,60 +175,13 @@ class CustomPlugin(ApiPluginBase):
             headers (Dict): The request headers.
             body (Dict): The request body.
 
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
-                - is_error: Whether the response indicates an error
-                - status_code: HTTP status code
-                - response_data: Response content
+        Returns:
+            BenchmarkData: The benchmark data including response and timing info.
         """
-        try:
-            # Set content type header
-            headers = {'Content-Type': 'application/json', **headers}
-
-            # Convert body to JSON
-            data = json.dumps(body, ensure_ascii=False)
-
-            # Send the request
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:  # noqa: E125
-                # Get the status code
-                status_code = response.status
-
-                # Check if it's a streaming response
-                if 'text/event-stream' in response.content_type:
-                    # Handle streaming response
-                    async for line in response.content:
-                        line_str = line.decode('utf-8').strip()
-                        if not line_str:
-                            continue
-
-                        # Check for data prefix in server-sent events
-                        if line_str.startswith('data: '):
-                            data = line_str[6:]  # Remove 'data: ' prefix
-
-                            # Check if it's the end of the stream
-                            if data == '[DONE]':
-                                break
-
-                            try:
-                                # Parse the JSON data
-                                parsed_data = json.loads(data)
-                                yield (False, status_code, json.dumps(parsed_data))
-                            except json.JSONDecodeError:
-                                yield (True, status_code, f'Failed to parse JSON: {data}')
-                else:
-                    # Handle regular response
-                    if 'application/json' in response.content_type:
-                        # JSON response
-                        content = await response.json()
-                        yield (status_code >= 400, status_code, json.dumps(content))
-                    else:
-                        # Text response
-                        content = await response.text()
-                        yield (status_code >= 400, status_code, content)
-
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, 500, str(e))
+        raise NotImplementedError(
+            'The `process_request` method must be implemented in a subclass. '
+            'For OpenAI-compatible APIs, consider inheriting from `DefaultApiPlugin` to reuse the default implementation.'  # noqa: E501
+        )
 
 
 if __name__ == '__main__':
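Since `CustomPlugin.process_request` now defers to subclasses, an OpenAI-compatible endpoint only needs the payload shaping; a hedged sketch (the `register_api` import and the `build_request(messages)` call site both appear in this changeset, but the decorator usage, registration name, and payload fields are assumptions):

    from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
    from evalscope.perf.plugin.registry import register_api


    @register_api('my_openai_like')  # hypothetical registration name
    class MyApiPlugin(DefaultApiPlugin):

        def build_request(self, messages):
            # Shape the request body; the HTTP/SSE handling is inherited
            # from DefaultApiPlugin.process_request.
            return {'model': 'my-model', 'messages': messages, 'stream': True}
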
evalscope/perf/plugin/api/default_api.py CHANGED
@@ -1,24 +1,77 @@
 import aiohttp
+import codecs
 import json
-from http import HTTPStatus
-from typing import Any, AsyncGenerator, Dict, List, Tuple
+import sys
+import time
+import traceback
+from typing import Any, Dict
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
-from evalscope.perf.utils.local_server import ServerSentEvent
+from evalscope.perf.utils.benchmark_util import BenchmarkData
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
+class StreamedResponseHandler:
+    """Handles streaming HTTP responses by accumulating chunks until complete
+    messages are available."""
+
+    def __init__(self):
+        self.buffer = ''
+        # Keep decoder state across chunks to handle split multibyte sequences
+        self.decoder = codecs.getincrementaldecoder('utf-8')()
+
+    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
+        """Add a chunk of bytes to the buffer and return any complete
+        messages."""
+        # Use incremental decoding so incomplete multibyte sequences don't error
+        try:
+            chunk_str = self.decoder.decode(chunk_bytes, final=False)
+        except UnicodeDecodeError:
+            # Bad bytes: drop them and reset decoder state to avoid corruption
+            self.decoder.reset()
+            chunk_str = chunk_bytes.decode('utf-8', errors='ignore')
+        self.buffer += chunk_str
+
+        messages = []
+
+        # Split by double newlines (SSE message separator)
+        while '\n\n' in self.buffer:
+            message, self.buffer = self.buffer.split('\n\n', 1)
+            message = message.strip()
+            if message:
+                messages.append(message)
+
+        # if self.buffer is not empty, check if it is a complete message
+        # by removing data: prefix and check if it is a valid JSON
+        if self.buffer.startswith('data: '):
+            message_content = self.buffer.removeprefix('data: ').strip()
+            if message_content == '[DONE]':
+                messages.append(self.buffer.strip())
+                self.buffer = ''
+            elif message_content:
+                try:
+                    json.loads(message_content)
+                    messages.append(self.buffer.strip())
+                    self.buffer = ''
+                except json.JSONDecodeError:
+                    # Incomplete JSON, wait for more chunks.
+                    pass
+
+        return messages
+
+
 class DefaultApiPlugin(ApiPluginBase):
     """Default implementation of API plugin with common HTTP handling methods."""
 
     def __init__(self, param: Arguments):
         super().__init__(param)
 
-    async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                              body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+    async def process_request(
+        self, client_session: aiohttp.ClientSession, url: str, headers: Dict, body: Dict
+    ) -> BenchmarkData:
         """Process the HTTP request and handle the response.
 
         Args:
@@ -27,79 +80,135 @@ class DefaultApiPlugin(ApiPluginBase):
             headers: The request headers
             body: The request body
 
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
-        """
-        try:
-            headers = {'Content-Type': 'application/json', **headers}
-            data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
-            async with client_session.request('POST', url=url, data=data, headers=headers) as response:
-                async for result in self._handle_response(response):
-                    yield result
-        except Exception as e:
-            logger.error(f'Error in process_request: {e}')
-            yield (True, None, str(e))
-
-    async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
-        """Handle streaming response from server-sent events.
-
-        Args:
-            response: The aiohttp response object containing a stream
-
-        Yields:
-            Tuple[bool, int, Any]: (is_error, status_code, data)
+        Returns:
+            BenchmarkData: Aggregated benchmarking data for the request/response.
         """
+        headers = {'Content-Type': 'application/json', **headers}
+        data = json.dumps(body, ensure_ascii=False)  # serialize to JSON
+
+        output = BenchmarkData()
+        ttft = 0.0
+        generated_text = ''
+        st = time.perf_counter()
+        output.start_time = st
+        output.request = data
+        most_recent_timestamp = st
         try:
-            async for chunk_bytes in response.content:
-                chunk_bytes = chunk_bytes.strip()
-                if not chunk_bytes:
-                    continue
-                chunk_bytes = chunk_bytes.decode('utf-8')
-                # NOTE: SSE comments (often used as pings) start with a colon.
-                # These are not JSON data payload and should be skipped.
-                if chunk_bytes.startswith(':'):
-                    continue
-
-                chunk = chunk_bytes.removeprefix('data: ')
-
-                if chunk != '[DONE]':
-                    data = json.loads(chunk)
-
-                    yield False, response.status, data
-
-        except Exception as e:
-            logger.error(f'Error in _handle_stream: {e}')
-            yield True, response.status, str(e)
-
-    async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
-        """Handle the HTTP response based on content type and status.
-
-        Args:
-            response: The aiohttp response object
-
-        Yields:
-            Tuple[bool, int, str]: (is_error, status_code, response_data)
-        """
-        response_status = response.status
-        response_content_type = response.content_type
-        content_type_json = 'application/json'
-        content_type_stream = 'text/event-stream'
-        is_success = (response_status == HTTPStatus.OK)
-
-        if is_success:
-            # Handle successful response with 'text/event-stream' content type
-            if content_type_stream in response_content_type:
-                async for is_error, response_status, content in self._handle_stream(response):
-                    yield (is_error, response_status, content)
-            # Handle successful response with 'application/json' content type
-            elif content_type_json in response_content_type:
-                content = await response.json()
-                yield (False, response_status, json.dumps(content, ensure_ascii=False))
-            # Handle other successful responses
-            else:
-                content = await response.read()
-                yield (False, response_status, content.decode('utf-8'))
-        else:
-            # error is always in JSON format
-            error = await response.json()
-            yield (True, response_status, json.dumps(error, ensure_ascii=False))
+            async with client_session.post(url=url, data=data, headers=headers) as response:
+                content_type = response.headers.get('Content-Type', '')
+                if response.status == 200:
+                    # Handle streaming responses (SSE)
+                    if 'text/event-stream' in content_type:
+                        handler = StreamedResponseHandler()
+                        async for chunk_bytes in response.content.iter_any():
+
+                            if not chunk_bytes:
+                                continue
+
+                            messages = handler.add_chunk(chunk_bytes)
+                            for message in messages:
+                                # NOTE: SSE comments (often used as pings) start with
+                                # a colon. These are not JSON data payload and should
+                                # be skipped.
+                                if message.startswith(':'):
+                                    continue
+
+                                chunk = message.removeprefix('data: ')
+
+                                if chunk != '[DONE]':
+                                    timestamp = time.perf_counter()
+                                    data = json.loads(chunk)
+
+                                    if choices := data.get('choices'):
+                                        content = choices[0]['delta'].get('content')
+                                        # First token
+                                        if ttft == 0.0:
+                                            ttft = timestamp - st
+                                            output.first_chunk_latency = ttft
+
+                                        # Decoding phase
+                                        else:
+                                            output.inter_chunk_latency.append(timestamp - most_recent_timestamp)
+
+                                        generated_text += content or ''
+                                        output.response_messages.append(data)
+                                    elif usage := data.get('usage'):
+                                        output.prompt_tokens = usage.get('prompt_tokens')
+                                        output.completion_tokens = usage.get('completion_tokens')
+
+                                    most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.completed_time = most_recent_timestamp
+                        output.query_latency = most_recent_timestamp - st
+
+                    # Handle non-stream JSON responses
+                    elif 'application/json' in content_type or 'application/' in content_type:
+                        payload: Any
+                        try:
+                            payload = await response.json()
+                        except Exception:
+                            # Fallback to text if JSON parsing fails
+                            payload = await response.text()
+
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        # For non-stream, first chunk equals full latency
+                        output.first_chunk_latency = output.query_latency
+
+                        if isinstance(payload, dict):
+                            # Extract generated text from choices
+                            text = ''
+                            if choices := payload.get('choices'):
+                                first = choices[0] if choices else {}
+                                # Chat Completions format
+                                msg = first.get('message') or {}
+                                if isinstance(msg, dict) and msg.get('content') is not None:
+                                    text = msg.get('content') or ''
+                                else:
+                                    # Legacy Completions format
+                                    text = first.get('text') or ''
+                            generated_text = text
+
+                            # Extract usage if provided
+                            if usage := payload.get('usage'):
+                                output.prompt_tokens = usage.get('prompt_tokens')
+                                output.completion_tokens = usage.get('completion_tokens')
+
+                            output.response_messages.append(payload)
+                        else:
+                            generated_text = str(payload)
+
+                        output.generated_text = generated_text
+                        output.success = True
+
+                    else:
+                        # Unknown successful content-type: read as text
+                        raw = await response.text()
+                        timestamp = time.perf_counter()
+                        output.completed_time = timestamp
+                        output.query_latency = timestamp - st
+                        output.first_chunk_latency = output.query_latency
+                        output.generated_text = raw
+                        output.response_messages.append(raw)
+                        output.success = True
+                else:
+                    # Try to parse structured error, fallback to reason/text
+                    try:
+                        err_payload = await response.json()
+                        output.error = json.dumps(err_payload, ensure_ascii=False)
+                    except Exception:
+                        try:
+                            output.error = await response.text()
+                        except Exception:
+                            output.error = response.reason or ''
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = ''.join(traceback.format_exception(*exc_info))
+            logger.error(output.error)
+
+        return output
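The buffering behaviour of `StreamedResponseHandler` is easiest to see on an SSE event that arrives split across reads; a small illustration against the class as defined above (the byte strings are made up):

    handler = StreamedResponseHandler()
    # First chunk ends mid-JSON: nothing is complete yet, so it is buffered.
    handler.add_chunk(b'data: {"choices": [{"delta": {"conte')   # -> []
    # Second chunk completes the event and adds the stream terminator.
    handler.add_chunk(b'nt": "hi"}}]}\n\ndata: [DONE]\n\n')
    # -> ['data: {"choices": [{"delta": {"content": "hi"}}]}', 'data: [DONE]']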