evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/models/image_edit_model.py (new file)
@@ -0,0 +1,125 @@
+ from __future__ import annotations
+
+ import importlib
+ import time
+ import torch
+ from logging import getLogger
+ from typing import Any, Dict, List, Literal, Optional, Protocol, Tuple, Union, cast
+
+ from evalscope.api.messages import (
+     ChatMessage,
+     ChatMessageAssistant,
+     ContentAudio,
+     ContentImage,
+     ContentText,
+     ContentVideo,
+ )
+ from evalscope.api.model import (
+     ChatCompletionChoice,
+     GenerateConfig,
+     Logprob,
+     Logprobs,
+     ModelAPI,
+     ModelOutput,
+     ModelUsage,
+     TopLogprob,
+ )
+ from evalscope.api.tool import ToolChoice, ToolInfo
+ from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL
+ from evalscope.utils.model_utils import get_device
+
+ logger = getLogger()
+
+
+ class ImageEditAPI(ModelAPI):
+
+     def __init__(
+         self,
+         model_name: str,
+         base_url: Optional[str] = None,
+         api_key: Optional[str] = None,
+         config: GenerateConfig = GenerateConfig(),
+         **model_args: Any,
+     ):
+         super().__init__(
+             model_name=model_name,
+             base_url=base_url,
+             api_key=api_key,
+             config=config,
+         )
+
+         # collect known model_args (then delete them so we can pass the rest on)
+         def collect_model_arg(name: str) -> Optional[Any]:
+             nonlocal model_args
+             value = model_args.get(name, None)
+             if value is not None:
+                 model_args.pop(name)
+             return value
+
+         model_path = collect_model_arg('model_path')
+         torch_dtype = collect_model_arg('precision') or collect_model_arg('torch_dtype')
+         device_map = collect_model_arg('device_map')
+         # torch dtype
+         DTYPE_MAP = {'float16': torch.float16, 'float32': torch.float32, 'bfloat16': torch.bfloat16, 'auto': 'auto'}
+
+         if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+             torch_dtype = DTYPE_MAP.get(torch_dtype, torch.float32)
+         self.torch_dtype = torch_dtype
+         self.device = device_map or get_device()
+
+         self.pipeline_cls = collect_model_arg('pipeline_cls')
+         # default to DiffusionPipeline if not specified
+         if self.pipeline_cls is None:
+             if 'qwen' in model_name.lower():
+                 self.pipeline_cls = 'QwenImageEditPipeline'
+             else:
+                 logger.error('Pipeline class not found. Please provide a valid `pipeline_cls` in model args.')
+                 raise ValueError('Invalid pipeline class.')
+
+         model_name_or_path = model_path or model_name
+
+         # from modelscope import pipeline_cls
+         module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+         logger.info(f'Loading model {model_name_or_path} with {self.pipeline_cls} ...')
+
+         self.model = module.from_pretrained(
+             model_name_or_path,
+             torch_dtype=self.torch_dtype,
+             **model_args,
+         )
+
+         self.model.to(self.device)
+
+     def generate(
+         self,
+         input: List[ChatMessage],
+         tools: List[ToolInfo],
+         tool_choice: ToolChoice,
+         config: GenerateConfig,
+     ) -> ModelOutput:
+
+         # prepare generator
+         kwargs: Dict[str, Any] = {}
+         if config.num_inference_steps is not None:
+             kwargs['num_inference_steps'] = config.num_inference_steps
+         kwargs.update(config.model_extra)
+
+         # assume the first text as prompt
+         content = input[0].content
+         assert isinstance(content[0], ContentText) and isinstance(content[1], ContentImage), \
+             'Invalid content types, expected (ContentText, ContentImage)'
+
+         prompt = content[0].text
+         input_image_base64 = content[1].image
+         input_image = base64_to_PIL(input_image_base64)
+         # get the first image as output
+         output = self.model(image=input_image, prompt=prompt, **kwargs)
+         image = output.images[0]
+
+         image_base64 = PIL_to_base64(image)
+
+         return ModelOutput(
+             model=self.model_name,
+             choices=[ChatCompletionChoice.from_content(content=[ContentImage(image=image_base64)])],
+             time=time.time(),
+         )
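
A minimal usage sketch of the new ImageEditAPI (not part of the diff): the checkpoint name and input file are placeholders, and the ChatMessageUser message type, the 'none' tool choice and the output access path are assumptions based on the surrounding evalscope.api interfaces rather than documented usage. Normal runs would go through the registered 'image_editing' model api instead of constructing the class directly.

    from PIL import Image

    from evalscope.api.messages import ChatMessageUser, ContentImage, ContentText
    from evalscope.api.model import GenerateConfig
    from evalscope.models.image_edit_model import ImageEditAPI
    from evalscope.utils.io_utils import PIL_to_base64

    # 'qwen' in the model name makes __init__ pick QwenImageEditPipeline (see above).
    api = ImageEditAPI(model_name='Qwen/Qwen-Image-Edit', precision='bfloat16')  # placeholder checkpoint

    source_b64 = PIL_to_base64(Image.open('input.png'))  # placeholder input image
    message = ChatMessageUser(content=[
        ContentText(text='Replace the sky with a sunset'),  # prompt first...
        ContentImage(image=source_b64),                     # ...then the image, as generate() asserts
    ])

    output = api.generate(
        input=[message], tools=[], tool_choice='none',
        config=GenerateConfig(num_inference_steps=30),
    )
    edited_b64 = output.choices[0].message.content[0].image  # base64-encoded edited image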

evalscope/models/model_apis.py
@@ -1,6 +1,7 @@
  from evalscope.api.model import ModelAPI
  from evalscope.api.registry import register_model_api
  from evalscope.utils.deprecation_utils import deprecated
+ from evalscope.utils.import_utils import check_import


  @register_model_api(name='mock_llm')
@@ -27,6 +28,8 @@ def server() -> type[ModelAPI]:

  @register_model_api(name='llm_ckpt')
  def llm_ckpt() -> type[ModelAPI]:
+     check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
      from .modelscope import ModelScopeAPI

      return ModelScopeAPI
@@ -35,6 +38,8 @@ def llm_ckpt() -> type[ModelAPI]:
  @register_model_api(name='checkpoint')
  @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
  def checkpoint() -> type[ModelAPI]:
+     check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
+
      from .modelscope import ModelScopeAPI

      return ModelScopeAPI
@@ -42,6 +47,23 @@ def checkpoint() -> type[ModelAPI]:

  @register_model_api(name='text2image')
  def text2image() -> type[ModelAPI]:
+     check_import(['torch', 'torchvision', 'diffusers'],
+                  package='evalscope[aigc]',
+                  raise_error=True,
+                  feature_name='text2image')
+
      from .text2image_model import Text2ImageAPI

      return Text2ImageAPI
+
+
+ @register_model_api(name='image_editing')
+ def image_editing() -> type[ModelAPI]:
+     check_import(['torch', 'torchvision', 'diffusers'],
+                  package='evalscope[aigc]',
+                  raise_error=True,
+                  feature_name='image_editing')
+
+     from .image_edit_model import ImageEditAPI
+
+     return ImageEditAPI
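
The pattern above is how evalscope keeps heavy optional dependencies lazy: the factory registered with @register_model_api only runs check_import and the real import when that model API is actually requested. A hedged sketch of registering a third-party backend the same way; every name below is hypothetical:

    from evalscope.api.model import ModelAPI
    from evalscope.api.registry import register_model_api
    from evalscope.utils.import_utils import check_import


    @register_model_api(name='my_diffusion_backend')
    def my_diffusion_backend() -> type[ModelAPI]:
        # Fail with an actionable message only when this backend is selected.
        check_import(['torch', 'diffusers'],
                     package='evalscope[aigc]',
                     raise_error=True,
                     feature_name='my_diffusion_backend')

        from my_package.my_model import MyDiffusionAPI  # hypothetical module

        return MyDiffusionAPI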

evalscope/models/openai_compatible.py
@@ -8,6 +8,7 @@ from evalscope.api.messages import ChatMessage
  from evalscope.api.model import ChatCompletionChoice, GenerateConfig, ModelAPI, ModelOutput
  from evalscope.api.tool import ToolChoice, ToolInfo
  from evalscope.utils import get_logger
+ from evalscope.utils.argument_utils import get_supported_params
  from .utils.openai import (
      chat_choices_from_openai,
      collect_stream_response,
@@ -48,6 +49,9 @@ class OpenAICompatibleAPI(ModelAPI):
          self.base_url = base_url or os.environ.get('EVALSCOPE_BASE_URL', None)
          assert self.base_url, f'Base URL for {model_name} not found'

+         # remove trailing slash from base_url
+         self.base_url = self.base_url.rstrip('/').removesuffix('/chat/completions')
+
          # create http client
          self.client = OpenAI(
              api_key=self.api_key,
@@ -81,6 +85,8 @@
              **completion_params,
          )

+         self.validate_request_params(request)
+
          try:
              # generate completion and save response for model call
              completion = self.client.chat.completions.create(**request)
@@ -109,6 +115,21 @@
              tools=tools,
          )

+     def validate_request_params(self, params: Dict[str, Any]):
+         """Hook for subclasses to do custom request parameter validation."""
+         # Cache supported params to avoid repeated calls to inspect.signature.
+         if not hasattr(self, '_valid_params'):
+             self._valid_params = get_supported_params(self.client.chat.completions.create)
+
+         # Move unsupported parameters to extra_body.
+         extra_body = params.get('extra_body', {})
+         for key in list(params.keys()):
+             if key not in self._valid_params:
+                 extra_body[key] = params.pop(key)
+
+         if extra_body:
+             params['extra_body'] = extra_body
+
      def on_response(self, response: Dict[str, Any]) -> None:
          """Hook for subclasses to do custom response handling."""
          pass
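
The effect of validate_request_params is easiest to see on a concrete request dict: any key that is not a keyword of the installed SDK's chat.completions.create gets moved into extra_body instead of being passed (and rejected) as a top-level argument. A stand-alone re-enactment of that logic with a made-up supported-parameter set:

    # Hypothetical subset of what get_supported_params() might report for the SDK call.
    valid_params = {'model', 'messages', 'temperature', 'top_p', 'extra_body'}

    request = {
        'model': 'qwen-plus',
        'messages': [{'role': 'user', 'content': 'hi'}],
        'temperature': 0.7,
        'top_k': 20,                 # not a chat.completions.create keyword
        'repetition_penalty': 1.05,  # not a keyword either
    }

    extra_body = request.get('extra_body', {})
    for key in list(request.keys()):
        if key not in valid_params:
            extra_body[key] = request.pop(key)
    if extra_body:
        request['extra_body'] = extra_body

    # request == {'model': 'qwen-plus', 'messages': [...], 'temperature': 0.7,
    #             'extra_body': {'top_k': 20, 'repetition_penalty': 1.05}}

The top_k and repetition_penalty fields added to openai_completion_params further down are not OpenAI SDK keywords, so presumably they reach vLLM-style servers through this extra_body route as well.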

evalscope/models/text2image_model.py
@@ -107,8 +107,8 @@ class Text2ImageAPI(ModelAPI):
              kwargs['num_inference_steps'] = config.num_inference_steps
          if config.guidance_scale is not None:
              kwargs['guidance_scale'] = config.guidance_scale
-         if config.extra_body is not None:
-             kwargs.update(config.extra_body)
+         # update with extra model parameters
+         kwargs.update(config.model_extra)

          # assume the first text as prompt
          prompt = input[0].text
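
The switch from config.extra_body to config.model_extra relies on GenerateConfig being a pydantic model that accepts undeclared fields: model_extra is pydantic's dict of extra fields, so any additional generation argument can ride along into the pipeline call. A small sketch; true_cfg_scale is a hypothetical extra argument, not a declared evalscope field:

    from evalscope.api.model import GenerateConfig

    config = GenerateConfig(num_inference_steps=30, true_cfg_scale=4.0)

    kwargs = {}
    if config.num_inference_steps is not None:
        kwargs['num_inference_steps'] = config.num_inference_steps
    kwargs.update(config.model_extra)  # picks up {'true_cfg_scale': 4.0}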

evalscope/models/utils/openai.py
@@ -104,10 +104,9 @@ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartPa
          )
      elif content.type == 'audio':
          audio_data_uri = file_as_data_uri(content.audio)
-         audio_data = audio_data_uri.split('base64,')[1]

          return ChatCompletionContentPartInputAudioParam(
              type='input_audio', input_audio=dict(data=audio_data, format=content.format)
+             type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
          )

      else:
@@ -175,6 +174,8 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
          params['stop'] = config.stop_seqs
      if config.presence_penalty is not None:
          params['presence_penalty'] = config.presence_penalty
+     if config.repetition_penalty is not None:
+         params['repetition_penalty'] = config.repetition_penalty
      if config.logit_bias is not None:
          params['logit_bias'] = config.logit_bias
      if config.seed is not None:
@@ -183,6 +184,8 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
          params['temperature'] = config.temperature
      if config.top_p is not None:
          params['top_p'] = config.top_p
+     if config.top_k is not None:
+         params['top_k'] = config.top_k
      if config.n is not None:
          params['n'] = config.n
      if config.logprobs is not None:
@@ -205,11 +208,15 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
          )
      if config.extra_body:
          params['extra_body'] = config.extra_body
+     if config.extra_query:
+         params['extra_query'] = config.extra_query
+     if config.extra_headers:
+         params['extra_headers'] = config.extra_headers

      return params


- def openai_assistant_content(message: ChatMessageAssistant) -> str:
+ def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
      # In agent bridge scenarios, we could encounter concepts such as reasoning and
      # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
      # choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +227,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
      else:
          content = ''
          for c in message.content:
-             if c.type == 'reasoning':
+             if c.type == 'reasoning' and include_reasoning:
                  attribs = ''
                  if c.signature is not None:
                      attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +246,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
      return content


- def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
+ def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
      oai_choices: List[Choice] = []

      for index, choice in enumerate(choices):
-         content = openai_assistant_content(choice.message)
+         # Handle content
+         content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
+
+         # Handle tool calls
          if choice.message.tool_calls:
              tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
          else:

evalscope/perf/arguments.py
@@ -33,11 +33,17 @@ class Arguments(BaseArgument):
      rate: int = -1  # Rate limit for requests (default: -1, no limit)
      sleep_interval: int = 5  # Sleep interval between performance runs, in seconds

+     # Tuning knobs
+     db_commit_interval: int = 1000  # Number of rows buffered before committing to the DB
+     queue_size_multiplier: int = 5  # Maxsize for queue = parallel * this multiplier
+     in_flight_task_multiplier: int = 2  # Max scheduled tasks = parallel * this multiplier
+
      # Logging and debugging
      log_every_n_query: int = 10  # Log every N queries
      debug: bool = False  # Debug mode
-     wandb_api_key: Optional[str] = None  # WandB API key for logging
-     swanlab_api_key: Optional[str] = None  # SwanLab API key for logging
+     visualizer: Optional[str] = None  # Visualizer for logging, supports 'swanlab' or 'wandb'
+     wandb_api_key: Optional[str] = None  # Will be deprecated in the future
+     swanlab_api_key: Optional[str] = None  # Will be deprecated in the future
      name: Optional[str] = None  # Name for the run

      # Output settings
@@ -55,6 +61,7 @@ class Arguments(BaseArgument):
      image_height: int = 224  # Height of the image for random VL dataset
      image_format: str = 'RGB'  # Image format for random VL dataset
      image_num: int = 1  # Number of images for random VL dataset
+     image_patch_size: int = 28  # Patch size for image tokenizer, only for local image token calculation

      # Dataset settings
      dataset: str = 'openqa'  # Dataset type (default: 'line_by_line')
@@ -67,7 +74,7 @@ class Arguments(BaseArgument):
      max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
      min_tokens: Optional[int] = None  # Minimum number of tokens in the response
      n_choices: Optional[int] = None  # Number of response choices
-     seed: Optional[int] = 0  # Random seed for reproducibility
+     seed: Optional[int] = None  # Random seed for reproducibility
      stop: Optional[List[str]] = None  # Stop sequences for the response
      stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
      stream: Optional[bool] = True  # Whether to stream the response
@@ -106,6 +113,14 @@ class Arguments(BaseArgument):
              self.parallel
          ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501

+         # Validate tuning knobs
+         if self.db_commit_interval <= 0:
+             self.db_commit_interval = 1
+         if self.queue_size_multiplier <= 0:
+             self.queue_size_multiplier = 1
+         if self.in_flight_task_multiplier <= 0:
+             self.in_flight_task_multiplier = 1
+


  class ParseKVAction(argparse.Action):
@@ -151,9 +166,15 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument(
          '--sleep-interval', type=int, default=5, help='Sleep interval between performance runs, in seconds. Default 5')  # noqa: E501

+     # Tuning knobs
+     parser.add_argument('--db-commit-interval', type=int, default=1000, help='Rows buffered before SQLite commit')
+     parser.add_argument('--queue-size-multiplier', type=int, default=5, help='Queue maxsize = parallel * multiplier')
+     parser.add_argument('--in-flight-task-multiplier', type=int, default=2, help='Max scheduled tasks = parallel * multiplier')  # noqa: E501
+
      # Logging and debugging
      parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
      parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
+     parser.add_argument('--visualizer', type=str, default=None, help='The visualizer to use, default None')
      parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
      parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
      parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')
@@ -171,6 +192,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
      parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
      parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+     parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation')  # noqa: E501

      # Output settings
      parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
@@ -188,7 +210,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument(
          '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
      parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-     parser.add_argument('--seed', type=int, help='The random seed', default=0)
+     parser.add_argument('--seed', type=int, help='The random seed', default=None)
      parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
      parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
      parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
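
Taken together, the new Arguments fields give the perf runner explicit back-pressure controls. A hedged sketch of setting them programmatically; the model and url values are placeholders and every field not shown keeps its default:

    from evalscope.perf.arguments import Arguments

    args = Arguments(
        model='qwen2.5-7b-instruct',                      # placeholder
        url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder
        db_commit_interval=500,        # commit to SQLite every 500 processed rows
        queue_size_multiplier=5,       # result queue maxsize = parallel * 5
        in_flight_task_multiplier=2,   # at most parallel * 2 scheduled request tasks
        visualizer='swanlab',          # replaces the wandb/swanlab API-key switches, which are marked for deprecation
        seed=42,                       # seed now defaults to None, so set it explicitly for reproducibility
    )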

evalscope/perf/benchmark.py
@@ -3,8 +3,6 @@ import json
  import numpy as np
  import platform
  import sqlite3
- import time
- from http import HTTPStatus
  from tqdm import tqdm
  from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Tuple

@@ -42,6 +40,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
      try:
          for messages in message_generator.build_messages():
              dataset_messages.append(messages)
+             if len(dataset_messages) >= args.number:
+                 break
      except StopIteration:
          pass

@@ -80,86 +80,58 @@ async def send_request(
      request: dict,
      benchmark_data_queue: asyncio.Queue,
      args: Arguments,
-     api_plugin: 'ApiPluginBase',
+     client: AioHttpClient,  # reuse shared client
  ):
      async with semaphore:
-         client = AioHttpClient(args, api_plugin)
-         async with client:
-             benchmark_data = BenchmarkData(request=request)
-             benchmark_data.start_time = time.perf_counter()
-             collected_messages = []
-             try:
-                 async for is_error, state_code, response_data in client.post(request):
-                     if is_error or state_code != HTTPStatus.OK:
-                         error_msg = str(response_data) if response_data else 'Unknown error'
-                         logger.error(f'Request: {request} failed, state_code: {state_code}, data: {error_msg}')
-                         benchmark_data.success = False
-                         break
-                     if response_data:
-                         collected_messages.append(response_data)
-                         benchmark_data.chunk_times.append(time.perf_counter())
-                 benchmark_data.success = True
-                 benchmark_data.update_gpu_usage()
-             except Exception as e:
-                 if response_data:
-                     collected_messages.append(response_data)
-                 benchmark_data.success = False
-                 logger.exception(e)
-                 logger.error(f'Request query: {request} exception')
-             finally:
-                 benchmark_data.completed_time = time.perf_counter()
-                 benchmark_data.response_messages = collected_messages
-                 await benchmark_data_queue.put(benchmark_data)
+         benchmark_data = await client.post(request)
+         benchmark_data.update_gpu_usage()
+         await benchmark_data_queue.put(benchmark_data)


  @exception_handler
  async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args: Arguments, api_plugin: 'ApiPluginBase'):
      metrics = BenchmarkMetrics(concurrency=args.parallel)
-
      result_db_path = get_result_db_path(args)

-     collected_benchmark_data = []
-
-     with tqdm(desc='Processing', total=args.number) as pbar:
-         while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
-             try:
-                 # Attempt to get benchmark data from the queue with a timeout
-                 benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
-                 benchmark_data_queue.task_done()
-             except asyncio.TimeoutError:
-                 # If timeout, continue to the next iteration
-                 continue
-
-             # Update metrics based on the benchmark data
-             metrics.update_metrics(benchmark_data, api_plugin)
-
-             # Collect benchmark data for later database insertion
-             collected_benchmark_data.append(benchmark_data)
-
-             # Create a message with the updated metrics
-             message = metrics.create_message()
+     # Stream inserts to DB to avoid accumulating all results in memory
+     commit_every = args.db_commit_interval
+     processed_since_commit = 0

-             # Log the message to wandb\swanlab if the api key is provided
-             if args.wandb_api_key:
-                 import wandb
-                 wandb.log(message)
-             if args.swanlab_api_key:
-                 import swanlab
-                 swanlab.log(message)
-
-             # Log the message to the logger every n queries
-             if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-                 msg = json.dumps(message, ensure_ascii=False, indent=2)
-                 logger.info(msg)
-
-             pbar.update(1)  # Update the progress bar
-
-     # Now perform database operations after all benchmark data has been processed
      with sqlite3.connect(result_db_path) as con:
          cursor = con.cursor()
          create_result_table(cursor)
-         for benchmark_data in collected_benchmark_data:
-             insert_benchmark_data(cursor, benchmark_data)
+
+         with tqdm(desc='Processing', total=args.number) as pbar:
+             while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+                 try:
+                     benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.1)
+                 except asyncio.TimeoutError:
+                     continue
+
+                 # Update metrics and write to DB immediately
+                 metrics.update_metrics(benchmark_data, api_plugin)
+                 insert_benchmark_data(cursor, benchmark_data)
+                 processed_since_commit += 1
+                 if processed_since_commit >= commit_every:
+                     con.commit()
+                     processed_since_commit = 0
+
+                 message = metrics.create_message()
+
+                 if args.wandb_api_key:
+                     import wandb
+                     wandb.log(message)
+                 if args.swanlab_api_key:
+                     import swanlab
+                     swanlab.log(message)
+
+                 if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                     msg = json.dumps(message, ensure_ascii=False, indent=2)
+                     logger.info(msg)
+
+                 benchmark_data_queue.task_done()
+                 pbar.update(1)
+
          con.commit()

      return metrics, result_db_path
@@ -177,31 +149,46 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
      loop = asyncio.get_running_loop()
      add_signal_handlers(loop)

-     # Create API plugin instance for request/response processing
      api_plugin_class = ApiRegistry.get_class(args.api)
      api_plugin = api_plugin_class(args)

-     # init queue
-     benchmark_data_queue = asyncio.Queue()
-     # reset event
+     benchmark_data_queue: asyncio.Queue = asyncio.Queue(maxsize=max(1, args.parallel * args.queue_size_multiplier))
      data_process_completed_event.clear()
+
      # test connection
      await connect_test(args, api_plugin)
-     # start statistic benchmark metric
-     statistic_benchmark_metric_task = asyncio.create_task(
-         statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
-     )
-     # start send request
-     semaphore = asyncio.Semaphore(args.parallel)
-     send_request_tasks: List[asyncio.Task] = []
-     async for request in get_requests(args, api_plugin):
-         task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, api_plugin))
-         send_request_tasks.append(task)
-
-     await asyncio.gather(*send_request_tasks, return_exceptions=True)
-     await benchmark_data_queue.join()
-     data_process_completed_event.set()
-
-     metrics, result_db_path = await statistic_benchmark_metric_task
+
+     # Create a single shared client session for all requests
+     client = AioHttpClient(args, api_plugin)
+     async with client:
+         # start statistic benchmark metric (consumer)
+         statistic_benchmark_metric_task = asyncio.create_task(
+             statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+         )
+
+         # start sending requests with bounded in-flight tasks
+         semaphore = asyncio.Semaphore(args.parallel)
+         in_flight: set[asyncio.Task] = set()
+         max_in_flight = args.parallel * args.in_flight_task_multiplier
+
+         async for request in get_requests(args, api_plugin):
+             # Keep the number of scheduled tasks bounded to avoid OOM
+             if len(in_flight) >= max_in_flight:
+                 done, pending = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
+                 in_flight = pending
+
+             task = asyncio.create_task(send_request(semaphore, request, benchmark_data_queue, args, client))
+             in_flight.add(task)
+
+         # Wait for remaining in-flight tasks
+         if in_flight:
+             await asyncio.gather(*in_flight, return_exceptions=True)
+
+         # Drain queue and finish
+         await benchmark_data_queue.join()
+         data_process_completed_event.set()
+
+         metrics, result_db_path = await statistic_benchmark_metric_task
+
      metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
      return metrics_result, percentile_result
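
The reworked benchmark() loop above is essentially the classic bounded in-flight producer pattern: never keep more than parallel * in_flight_task_multiplier scheduled tasks alive, so an arbitrarily long request stream cannot exhaust memory. A self-contained sketch of the same pattern outside evalscope:

    import asyncio


    async def worker(i: int) -> None:
        await asyncio.sleep(0.01)  # stand-in for one HTTP request


    async def main(total: int = 100, parallel: int = 8, multiplier: int = 2) -> None:
        max_in_flight = parallel * multiplier
        in_flight: set[asyncio.Task] = set()

        for i in range(total):  # stand-in for `async for request in get_requests(...)`
            if len(in_flight) >= max_in_flight:
                # Block until at least one task finishes; keep only the pending ones
                _done, in_flight = await asyncio.wait(in_flight, return_when=asyncio.FIRST_COMPLETED)
            in_flight.add(asyncio.create_task(worker(i)))

        # Wait for whatever is still running at the end of the stream
        if in_flight:
            await asyncio.gather(*in_flight, return_exceptions=True)


    asyncio.run(main())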