evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/perf/plugin/api/openai_api.py
@@ -1,10 +1,13 @@
  import json
+ import math
  import os
+ from collections import defaultdict
  from typing import Any, Dict, List, Tuple, Union

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
  from evalscope.perf.plugin.registry import register_api
+ from evalscope.utils.io_utils import base64_to_PIL
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -99,7 +102,7 @@ class OpenaiPlugin(DefaultApiPlugin):
  payload.update(param.extra_args)
  return payload

- def parse_responses(self, responses, request: Any = None, **kwargs) -> tuple[int, int]:
+ def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> tuple[int, int]:
  """Parser responses and return number of request and response tokens.
  Only one response for non-stream, multiple responses for stream.
  """
@@ -113,7 +116,7 @@ class OpenaiPlugin(DefaultApiPlugin):
  return input_tokens, output_tokens

  # no usage information in the response, parse the response to get the tokens
- delta_contents = {}
+ delta_contents = defaultdict(list)
  for response in responses:
  if 'object' in response:
  self.__process_response_object(response, delta_contents)
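
Note: switching delta_contents to a defaultdict(list) lets streaming chunks and complete responses share one accumulation path; per choice index, deltas are appended and later joined before token counting. A small illustration with made-up chunk payloads:

    from collections import defaultdict

    delta_contents = defaultdict(list)
    chunks = [
        {'object': 'chat.completion.chunk', 'choices': [{'index': 0, 'delta': {'content': 'Hel'}}]},
        {'object': 'chat.completion.chunk', 'choices': [{'index': 0, 'delta': {'content': 'lo!'}}]},
    ]
    for chunk in chunks:
        for choice in chunk['choices']:
            if 'delta' in choice and 'content' in choice['delta']:
                delta_contents[choice['index']].append(choice['delta']['content'])

    full_text = {idx: ''.join(parts) for idx, parts in delta_contents.items()}
    # full_text == {0: 'Hello!'}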
@@ -123,41 +126,46 @@ class OpenaiPlugin(DefaultApiPlugin):
  input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
  return input_tokens, output_tokens

- def __process_response_object(self, js, delta_contents):
- if js['object'] == 'chat.completion':
- for choice in js['choices']:
+ def __process_response_object(self, response, delta_contents):
+ if not response.get('choices'):
+ return
+ if response['object'] == 'chat.completion':
+ for choice in response['choices']:
  delta_contents[choice['index']] = [choice['message']['content']]
- elif js['object'] == 'text_completion':
- for choice in js['choices']:
- delta_contents[choice['index']] = [choice['text']]
- elif js['object'] == 'chat.completion.chunk':
- for choice in js.get('choices', []):
+ elif response['object'] == 'text_completion':
+ for choice in response['choices']:
+ if 'text' in choice and 'index' in choice:
+ delta_contents[choice['index']].append(choice['text'])
+ elif response['object'] == 'chat.completion.chunk':
+ for choice in response['choices']:
  if 'delta' in choice and 'index' in choice:
  delta = choice['delta']
  idx = choice['index']
  if 'content' in delta:
- delta_content = delta['content']
- delta_contents.setdefault(idx, []).append(delta_content)
+ delta_contents[idx].append(delta['content'])

- def __process_no_object(self, js, delta_contents):
+ def __process_no_object(self, response, delta_contents):
  # assume the response is a single choice
- for choice in js['choices']:
+ if not response.get('choices'):
+ return
+ for choice in response['choices']:
  if 'delta' in choice:
  delta = choice['delta']
  idx = choice['index']
  if 'content' in delta:
- delta_content = delta['content']
- delta_contents.setdefault(idx, []).append(delta_content)
+ delta_contents[idx].append(delta['content'])
  else:
  delta_contents[choice['index']] = [choice['message']['content']]

- def __calculate_tokens_from_content(self, request, delta_contents):
+ def __calculate_tokens_from_content(self, request, content):
  input_tokens = output_tokens = 0
  if self.tokenizer is not None:
- for idx, choice_contents in delta_contents.items():
+ # Calculate input tokens
+ input_tokens += self._count_input_tokens(request)
+ for idx, choice_contents in content.items():
  full_response_content = ''.join(choice_contents)
- input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
- output_tokens += len(self.tokenizer.encode(full_response_content))
+ # Calculate output tokens
+ output_tokens += self._count_output_tokens(full_response_content)
  else:
  raise ValueError(
  'Error: Unable to retrieve usage information\n\n'
@@ -171,3 +179,60 @@ class OpenaiPlugin(DefaultApiPlugin):
  'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
  )
  return input_tokens, output_tokens
+
+ def _count_input_tokens(self, request_str: str) -> int:
+ """Count the number of input tokens in the request.
+
+ This method handles different types of requests and calculates tokens for:
+ - Text content in messages or prompts
+ - Images in multimodal messages (converted to patch tokens)
+
+ Args:
+ request_str (str): The request json str containing either 'messages' for chat
+ completion or 'prompt' for text completion.
+
+ Returns:
+ int: The total number of input tokens including text and image tokens.
+ """
+ input_tokens = 0
+ request = json.loads(request_str)
+ if 'messages' in request:
+ input_content = self.tokenizer.apply_chat_template(
+ request['messages'], tokenize=True, add_generation_prompt=True
+ )
+ input_tokens += len(input_content)
+ # handle image tokens if any
+ for message in request['messages']:
+ content = message.get('content', '')
+ if isinstance(content, str):
+ continue
+ for cont in content:
+ if cont['type'] == 'image_url':
+ try:
+ # assuming image_url is base64 string
+ image_base64 = cont['image_url']['url']
+ image = base64_to_PIL(image_base64)
+ # Use math.ceil for more accurate token count when image dimensions
+ # aren't perfectly divisible by patch size
+ n_patches = (
+ math.ceil(image.height / self.param.image_patch_size)
+ * math.ceil(image.width / self.param.image_patch_size)
+ )
+ input_tokens += n_patches
+ except Exception as e:
+ logger.warning(f'Failed to process image for token counting: {e}')
+ # Continue processing other content without failing
+ elif 'prompt' in request:
+ input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+ return input_tokens
+
+ def _count_output_tokens(self, response: str) -> int:
+ """Count the number of output tokens in the response. Only string response is supported.
+
+ Args:
+ response (str): The API response text.
+
+ Returns:
+ int: The number of output tokens.
+ """
+ return len(self.tokenizer.encode(response, add_special_tokens=False))
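
Note: for multimodal requests, _count_input_tokens approximates image cost as one token per vision patch, rounding partial patches up. A quick standalone sketch of that arithmetic (the 14-pixel patch size here is illustrative; the real value comes from the image_patch_size setting):

    import math

    def estimate_image_tokens(height: int, width: int, patch_size: int = 14) -> int:
        # One token per patch; edge patches that are only partially covered still count.
        return math.ceil(height / patch_size) * math.ceil(width / patch_size)

    print(estimate_image_tokens(512, 768))  # ceil(512/14) * ceil(768/14) = 37 * 55 = 2035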

evalscope/perf/plugin/datasets/base.py
@@ -15,6 +15,11 @@ class DatasetPluginBase:
  dataset_path (str, optional): The input dataset path. Defaults to None.
  """
  self.query_parameters = query_parameters
+ if query_parameters.tokenizer_path:
+ from modelscope import AutoTokenizer
+ self.tokenizer = AutoTokenizer.from_pretrained(query_parameters.tokenizer_path, trust_remote_code=True)
+ else:
+ self.tokenizer = None

  def __next__(self):
  for item in self.build_messages():
@@ -85,3 +90,19 @@ class DatasetPluginBase:
  for url in image_urls:
  message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
  return message
+
+ def check_prompt_length(self, prompt: str) -> Tuple[bool, int]:
+ """Check if the prompt length is within the specified range.
+
+ Args:
+ prompt (str): The input prompt string.
+
+ Returns:
+ Tuple[bool, int]: A tuple containing a boolean indicating whether the prompt is valid and its length.
+ """
+ if self.tokenizer is None:
+ prompt_length = len(prompt)
+ else:
+ prompt_length = len(self.tokenizer.encode(prompt))
+ is_valid = self.query_parameters.min_prompt_length <= prompt_length <= self.query_parameters.max_prompt_length
+ return is_valid, prompt_length
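
Note: with the tokenizer now constructed in DatasetPluginBase, plugins only need the boolean from check_prompt_length; length is measured in tokens when --tokenizer-path is set and falls back to character count otherwise. A rough usage sketch (the subclass name is illustrative, not from this release):

    from typing import Dict, Iterator, List

    class MyLineDatasetPlugin(DatasetPluginBase):  # hypothetical plugin for illustration
        def build_messages(self) -> Iterator[List[Dict]]:
            for line in self.dataset_line_by_line(self.query_parameters.dataset_path):
                prompt = line.strip()
                is_valid, _length = self.check_prompt_length(prompt)
                if is_valid:
                    yield [self.create_message(prompt)]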

evalscope/perf/plugin/datasets/custom.py
@@ -16,9 +16,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
  def build_messages(self) -> Iterator[List[Dict]]:
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  prompt = item.strip()
- if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt
- ) < self.query_parameters.max_prompt_length:
+ is_valid, _ = self.check_prompt_length(prompt)
+ if is_valid:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/flickr8k.py
@@ -22,7 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
  for item in dataset:
  pil_image = item['jpg']
  text = item['txt']
- base64_image = PIL_to_base64(pil_image)
+ base64_image = PIL_to_base64(pil_image, add_header=True)

- message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
+ message = self.create_message(text=text, image_urls=base64_image)
  yield [message]

evalscope/perf/plugin/datasets/kontext_bench.py
@@ -22,7 +22,7 @@ class KontextDatasetPlugin(DatasetPluginBase):
  for item in dataset:
  pil_image = item['image']
  text = item['instruction']
- base64_image = PIL_to_base64(pil_image)
+ base64_image = PIL_to_base64(pil_image, add_header=True)

- message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
+ message = self.create_message(text=text, image_urls=base64_image)
  yield [message]

evalscope/perf/plugin/datasets/line_by_line.py
@@ -17,9 +17,8 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
  def build_messages(self) -> Iterator[List[Dict]]:
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  prompt = item.strip()
- if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt
- ) < self.query_parameters.max_prompt_length:
+ is_valid, _ = self.check_prompt_length(prompt)
+ if is_valid:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/longalpaca.py
@@ -22,9 +22,8 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
  ds = self.dataset_json_list(self.query_parameters.dataset_path)
  for item in ds:
  prompt = item['instruction'].strip()
- if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt
- ) < self.query_parameters.max_prompt_length:
+ is_valid, _ = self.check_prompt_length(prompt)
+ if is_valid:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/openqa.py
@@ -27,10 +27,8 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  item = json.loads(item)
  prompt = item['question'].strip()
- if (
- len(prompt) > self.query_parameters.min_prompt_length
- and len(prompt) < self.query_parameters.max_prompt_length
- ):
+ is_valid, _ = self.check_prompt_length(prompt)
+ if is_valid:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/random_dataset.py
@@ -12,11 +12,9 @@ class RandomDatasetPlugin(DatasetPluginBase):
  """

  def __init__(self, query_parameters: Arguments):
+ assert query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer-path`.' # noqa: E501
  super().__init__(query_parameters)
- assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.' # noqa: E501

- from modelscope import AutoTokenizer
- self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
  self.prefix_length = self.query_parameters.prefix_length
  self.prefix_ids = self.get_random_inputs(self.prefix_length)
  self.template_len = self.get_template_len()

evalscope/perf/plugin/datasets/random_vl_dataset.py
@@ -31,7 +31,7 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
  # Generate random images based on image_num
  images_b64 = []
  for _ in range(self.image_num):
- images_b64.append(f'data:image/png;base64,{self._generate_random_image_b64()}')
+ images_b64.append(self._generate_random_image_b64())

  message = self.create_message(text=prompt, image_urls=images_b64)
  yield [message]
@@ -77,4 +77,4 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
  draw.line(coords, fill=shape_color, width=random.randint(1, 5))

  # Convert to base64
- return PIL_to_base64(image, format='PNG')
+ return PIL_to_base64(image, format='PNG', add_header=True)
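
Note: these call sites used to wrap the raw base64 string in a data URI by hand; with add_header=True the helper is expected to return the URI directly. A hedged sketch of an equivalent helper (not the evalscope implementation, just what the call sites imply):

    import base64
    import io
    from PIL import Image

    def pil_to_base64_sketch(image: Image.Image, format: str = 'PNG', add_header: bool = False) -> str:
        buffer = io.BytesIO()
        image.save(buffer, format=format)
        encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
        if add_header:
            # Same shape as the f-string the plugins previously built themselves.
            return f'data:image/{format.lower()};base64,{encoded}'
        return encoded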

evalscope/perf/utils/benchmark_util.py
@@ -1,8 +1,7 @@
- import time
- import torch
  from dataclasses import dataclass, field
  from typing import Any, List, Optional, Tuple

+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -10,7 +9,7 @@ logger = get_logger()

  @dataclass
  class BenchmarkData:
- request: Any = None
+ request: str = None # json serialized request body
  start_time: float = 0.0
  completed_time: float = 0.0
  chunk_times: List[float] = field(default_factory=list)
@@ -24,30 +23,34 @@ class BenchmarkData:
  time_per_output_token: float = 0.0
  inter_chunk_latency: List[float] = field(default_factory=list)

- prompt_tokens = None
- completion_tokens = None
-
- def _calculate_query_stream_metric(self) -> None:
- self.query_latency = self.completed_time - self.start_time
- # only for stream responses
- if len(self.chunk_times) > 1:
- self.first_chunk_latency = self.chunk_times[0] - self.start_time
- # remove the first chunk time from the total latency
- self.time_per_output_token = (self.query_latency - self.first_chunk_latency
- ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
- self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
- else:
- self.first_chunk_latency = self.query_latency
+ # response content
+ generated_text: str = ''
+ error: Optional[str] = None
+ prompt_tokens: Optional[int] = None
+ completion_tokens: Optional[int] = None

  def _calculate_tokens(self, api_plugin):
- self.prompt_tokens, self.completion_tokens = \
- api_plugin.parse_responses(self.response_messages, request=self.request)
+ if self.prompt_tokens is None or self.completion_tokens is None:
+ self.prompt_tokens, self.completion_tokens = api_plugin.parse_responses(
+ self.response_messages, request=self.request
+ )
+
+ # Calculate time per output token
+ if self.completion_tokens and self.completion_tokens > 1:
+ # tpot = (latency - ttft) / (output_len - 1)
+ self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (self.completion_tokens - 1)
+
+ # Ensure inter-chunk latency is available (compute from chunk_times if needed)
+ if not self.inter_chunk_latency and self.chunk_times:
+ self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]

  def update_gpu_usage(self):
- total_memory = 0
- for i in range(torch.cuda.device_count()):
- total_memory += (torch.cuda.max_memory_allocated(i) / 2**30) # GB
- self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
+ if check_import('torch', raise_warning=False):
+ import torch
+ total_memory = 0
+ for i in range(torch.cuda.device_count()):
+ total_memory += (torch.cuda.max_memory_allocated(i) / 2**30) # GB
+ self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)


  class Metrics:
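
Note: time per output token now uses the conventional tpot = (latency - ttft) / (output_len - 1), excluding the first token, which is already covered by time-to-first-token. A worked example with made-up numbers:

    query_latency = 4.2        # seconds from request start to completion
    first_chunk_latency = 0.6  # time to first token (ttft)
    completion_tokens = 181

    time_per_output_token = (query_latency - first_chunk_latency) / (completion_tokens - 1)
    print(f'{time_per_output_token * 1000:.1f} ms/token')  # (4.2 - 0.6) / 180 -> 20.0 ms/token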
@@ -77,6 +80,7 @@ class BenchmarkMetrics:
  n_total_prompt_tokens: int = 0
  n_total_completion_tokens: int = 0
  start_time: Optional[float] = None
+ last_completed_time: Optional[float] = None
  total_time: float = 1.0
  n_total_queries: int = 0
  n_time_per_output_token: float = 0.0
@@ -95,9 +99,6 @@ class BenchmarkMetrics:

  def update_metrics(self, benchmark_data: BenchmarkData, api_plugin):
  self.n_total_queries += 1
- if self.start_time is None:
- self.start_time = benchmark_data.start_time
- self.total_time = time.perf_counter() - self.start_time

  if benchmark_data.success:
  self.n_succeed_queries += 1
@@ -106,7 +107,6 @@ class BenchmarkMetrics:
  self.n_total_prompt_tokens += benchmark_data.prompt_tokens
  self.n_total_completion_tokens += benchmark_data.completion_tokens

- benchmark_data._calculate_query_stream_metric()
  self.total_latency += benchmark_data.query_latency
  self.total_first_chunk_latency += benchmark_data.first_chunk_latency
  self.n_time_per_output_token += benchmark_data.time_per_output_token
@@ -115,6 +115,22 @@ class BenchmarkMetrics:
  self.n_failed_queries += 1

  self.calculate_averages()
+ self.update_total_time(benchmark_data)
+
+ def update_total_time(self, benchmark_data: BenchmarkData):
+ # Use the earliest start_time seen so far
+ if self.start_time is None:
+ self.start_time = benchmark_data.start_time
+ else:
+ self.start_time = min(self.start_time, benchmark_data.start_time)
+ # Track the latest completion time
+ if self.last_completed_time is None:
+ self.last_completed_time = benchmark_data.completed_time
+ else:
+ self.last_completed_time = max(self.last_completed_time, benchmark_data.completed_time)
+ # Compute total_time from request lifecycle timestamps to avoid consumer overhead
+ if self.start_time is not None and self.last_completed_time is not None:
+ self.total_time = max(self.last_completed_time - self.start_time, 0.0)

  def calculate_averages(self):
  if self.n_succeed_queries == 0:
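
Note: total_time is now folded from request lifecycle timestamps (earliest start, latest completion) instead of sampling a timer in the metrics consumer. A minimal sketch of the same min/max folding with illustrative timestamps:

    requests = [
        # (start_time, completed_time), e.g. values from time.perf_counter()
        (100.0, 104.5),
        (100.2, 103.9),
        (101.1, 106.3),
    ]

    start_time = min(start for start, _ in requests)
    last_completed_time = max(done for _, done in requests)
    total_time = max(last_completed_time - start_time, 0.0)  # 106.3 - 100.0 = 6.3 s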

evalscope/perf/utils/db_util.py
@@ -19,7 +19,7 @@ logger = get_logger()
  class DatabaseColumns:
  REQUEST = 'request'
  START_TIME = 'start_time'
- CHUNK_TIMES = 'chunk_times'
+ INTER_TOKEN_LATENCIES = 'inter_token_latencies'
  SUCCESS = 'success'
  RESPONSE_MESSAGES = 'response_messages'
  COMPLETED_TIME = 'completed_time'
@@ -60,7 +60,7 @@ def create_result_table(cursor):
  f'''CREATE TABLE IF NOT EXISTS result(
  {DatabaseColumns.REQUEST} TEXT,
  {DatabaseColumns.START_TIME} REAL,
- {DatabaseColumns.CHUNK_TIMES} TEXT,
+ {DatabaseColumns.INTER_TOKEN_LATENCIES} TEXT,
  {DatabaseColumns.SUCCESS} INTEGER,
  {DatabaseColumns.RESPONSE_MESSAGES} TEXT,
  {DatabaseColumns.COMPLETED_TIME} REAL,
@@ -75,15 +75,15 @@ def create_result_table(cursor):


  def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
- request = encode_data(benchmark_data.request)
- chunk_times = json.dumps(benchmark_data.chunk_times)
+ request = benchmark_data.request
+ inter_token_latencies = json.dumps(benchmark_data.inter_chunk_latency)
  response_messages = encode_data(benchmark_data.response_messages)

  # Columns common to both success and failure cases
  common_columns = (
  request,
  benchmark_data.start_time,
- chunk_times,
+ inter_token_latencies,
  benchmark_data.success,
  response_messages,
  benchmark_data.completed_time,
@@ -96,7 +96,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
  benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
  )
  query = f"""INSERT INTO result(
- {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+ {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
  {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
  {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY}, {DatabaseColumns.PROMPT_TOKENS},
  {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.MAX_GPU_MEMORY_COST},
@@ -105,7 +105,7 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
  cursor.execute(query, common_columns + additional_columns)
  else:
  query = f"""INSERT INTO result(
- {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
+ {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES},
  {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME}
  ) VALUES (?, ?, ?, ?, ?, ?)"""
  cursor.execute(query, common_columns)
@@ -173,20 +173,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
  :param result_db_path: Path to the SQLite database file.
  :return: Dictionary of percentiles for various metrics.
  """
-
- def inter_token_latencies(chunk_times_json: str) -> List[float]:
- try:
- chunk_times = json.loads(chunk_times_json)
- return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
- except (json.JSONDecodeError, TypeError) as e:
- logger.error(f'Error parsing chunk times: {e}')
- return []
-
- query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES}, {DatabaseColumns.SUCCESS},
+ query_sql = f'''SELECT {DatabaseColumns.START_TIME}, {DatabaseColumns.INTER_TOKEN_LATENCIES}, {DatabaseColumns.SUCCESS},
  {DatabaseColumns.COMPLETED_TIME}, {DatabaseColumns.LATENCY}, {DatabaseColumns.FIRST_CHUNK_LATENCY},
  {DatabaseColumns.PROMPT_TOKENS},
  {DatabaseColumns.COMPLETION_TOKENS}, {DatabaseColumns.TIME_PER_OUTPUT_TOKEN}
- FROM result WHERE {DatabaseColumns.SUCCESS}=1'''
+ FROM result WHERE {DatabaseColumns.SUCCESS}=1''' # noqa: E501

  percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

@@ -202,7 +193,11 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
  # Prepare data for each metric
  inter_token_latencies_all = []
  for row in rows:
- inter_token_latencies_all.extend(inter_token_latencies(row[col_indices[DatabaseColumns.CHUNK_TIMES]]))
+ try:
+ itl = json.loads(row[col_indices[DatabaseColumns.INTER_TOKEN_LATENCIES]]) or []
+ inter_token_latencies_all.extend(itl)
+ except (json.JSONDecodeError, TypeError) as e:
+ logger.error(f'Error parsing inter token latencies: {e}')

  metrics = {
  PercentileMetrics.TTFT: [row[col_indices[DatabaseColumns.FIRST_CHUNK_LATENCY]] for row in rows],
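
Note: with inter-token latencies persisted as a JSON list per request, the percentile report only needs to flatten the lists and rank the values. A hedged sketch of that aggregation (numpy is used for the percentiles here, which may not match the actual implementation):

    import json
    import numpy as np

    rows = [
        '[0.021, 0.019, 0.025]',  # inter_token_latencies column, one JSON list per request
        '[0.018, 0.022]',
    ]

    itl_all = []
    for raw in rows:
        try:
            itl_all.extend(json.loads(raw) or [])
        except (json.JSONDecodeError, TypeError):
            continue

    percentiles = [50, 90, 99]
    print(dict(zip(percentiles, np.percentile(itl_all, percentiles).tolist())))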

evalscope/perf/utils/local_server.py
@@ -2,61 +2,18 @@ import os
  import subprocess
  import uvicorn
  from contextlib import asynccontextmanager
- from dataclasses import dataclass
  from fastapi import FastAPI
  from fastapi.middleware.cors import CORSMiddleware
  from sse_starlette.sse import EventSourceResponse

  from evalscope.perf.arguments import Arguments
  from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()


- @dataclass
- class ServerSentEvent(object):
-
- def __init__(self, data='', event=None, id=None, retry=None):
- self.data = data
- self.event = event
- self.id = id
- self.retry = retry
-
- @classmethod
- def decode(cls, line):
- """Decode line to ServerSentEvent
-
-
- Args:
- line (str): The line.
-
- Return:
- ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
-
- """
- if not line:
- return None
- sse_msg = cls()
- # format data:xxx
- field_type, _, field_value = line.partition(':')
- if field_value.startswith(' '): # compatible with openai api
- field_value = field_value[1:]
- if field_type == 'event':
- sse_msg.event = field_value
- elif field_type == 'data':
- field_value = field_value.rstrip()
- sse_msg.data = field_value
- elif field_type == 'id':
- sse_msg.id = field_value
- elif field_type == 'retry':
- sse_msg.retry = field_value
- else:
- pass
-
- return sse_msg
-
-
  @asynccontextmanager
  async def lifespan(app: FastAPI):
  yield
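
Note: torch is no longer a hard dependency of the perf utilities; check_import now guards both the GPU-memory sampling in benchmark_util and the local-server path before torch is touched. A generic sketch of that optional-import pattern (not evalscope's helper, just the idea):

    import importlib.util

    def has_module(name: str) -> bool:
        # True if the module is importable, without importing it yet.
        return importlib.util.find_spec(name) is not None

    if has_module('torch'):
        import torch
        gpu_gib = sum(
            torch.cuda.max_memory_allocated(i) / 2**30 for i in range(torch.cuda.device_count())
        )
    else:
        gpu_gib = 0.0  # keep benchmarking usable without a GPU stack installed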
@@ -101,6 +58,8 @@ def create_app(model, attn_implementation=None) -> FastAPI:
  def start_app(args: Arguments):
  logger.info('Starting local server, please wait...')
  if args.api == 'local':
+ check_import('torch', 'torch', raise_error=True)
+
  app = create_app(args.model, args.attn_implementation)
  uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)


evalscope/perf/utils/log_utils.py
@@ -15,29 +15,42 @@ def init_wandb(args: Arguments) -> None:
  raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
  os.environ['WANDB_SILENT'] = 'true'
  os.environ['WANDB_DIR'] = args.outputs_dir
-
- wandb.login(key=args.wandb_api_key)
  current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
  name = args.name if args.name else f'{args.model_id}_{current_time}'
- wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+ # Remove sensitive information from logging config
+ logging_config = args.to_dict()
+ logging_config.pop('api_key', None)
+ logging_config.pop('wandb_api_key', None)
+
+ if args.wandb_api_key is not None:
+ wandb.login(key=args.wandb_api_key)
+ wandb.init(project='perf_benchmark', name=name, config=logging_config)


  def init_swanlab(args: Arguments) -> None:
+ """
+ Initialize SwanLab for logging.
+ """
  import datetime
  try:
  import swanlab
  except ImportError:
  raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
  os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
- if not args.swanlab_api_key == 'local':
- swanlab.login(api_key=args.swanlab_api_key)
  current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
  name = args.name if args.name else f'{args.model_id}_{current_time}'
  swanlab.config.update({'framework': '📏evalscope'})
+
+ # Remove sensitive information from logging config
+ logging_config = args.to_dict()
+ logging_config.pop('api_key', None)
+ logging_config.pop('swanlab_api_key', None)
+
  init_kwargs = {
  'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
  'name': name,
- 'config': args.to_dict(),
+ 'config': logging_config,
  'mode': 'local' if args.swanlab_api_key == 'local' else None
  }

@@ -45,4 +58,6 @@ def init_swanlab(args: Arguments) -> None:
  if workspace:
  init_kwargs['workspace'] = workspace

+ if isinstance(args.swanlab_api_key, str) and not args.swanlab_api_key == 'local':
+ swanlab.login(api_key=args.swanlab_api_key)
  swanlab.init(**init_kwargs)
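
Note: both trackers now scrub credentials from the run config before uploading it; the scrub is just a pop on the serialized arguments. A minimal sketch with a made-up args dict:

    args_dict = {
        'model_id': 'qwen2.5-7b-instruct',  # illustrative values only
        'parallel': 8,
        'api_key': 'sk-secret',
        'wandb_api_key': 'wandb-secret',
    }

    logging_config = dict(args_dict)
    logging_config.pop('api_key', None)
    logging_config.pop('wandb_api_key', None)
    # logging_config now carries only non-sensitive fields for wandb/swanlab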