evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/perf/plugin/api/openai_api.py

@@ -1,35 +1,38 @@
 import json
+import math
 import os
-from typing import Any, Dict, Iterator, List, Union
+from collections import defaultdict
+from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
-from evalscope.perf.plugin.api.base import ApiPluginBase
+from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.io_utils import base64_to_PIL
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
 @register_api(['openai', 'local_vllm', 'local'])
-class OpenaiPlugin(ApiPluginBase):
+class OpenaiPlugin(DefaultApiPlugin):
     """Base of openai interface."""
 
-    def __init__(self, mode_path: str):
-        """Init the plugin
+    def __init__(self, param: Arguments):
+        """Initialize the OpenaiPlugin.
 
         Args:
-            mode_path (str): The model path, we use the tokenizer
-                weight in the model to calculate the number of the
-                input and output tokens.
+            param (Arguments): Configuration object containing parameters
+                such as the tokenizer path and model details. If a tokenizer
+                path is provided, it is used to initialize the tokenizer.
         """
-        super().__init__(model_path=mode_path)
-        if mode_path is not None:
-            from transformers import AutoTokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
+        super().__init__(param=param)
+        if param.tokenizer_path is not None:
+            from modelscope import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(param.tokenizer_path)
         else:
             self.tokenizer = None
 
-    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments = None) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
@@ -42,6 +45,7 @@ class OpenaiPlugin(ApiPluginBase):
         Returns:
             Dict: The request body. None if prompt format is error.
         """
+        param = param or self.param
         try:
             if param.query_template is not None:
                 if param.query_template.startswith('@'):
@@ -54,8 +58,6 @@ class OpenaiPlugin(ApiPluginBase):
                 else:
                     query = json.loads(param.query_template)
 
-                if 'stream' in query.keys():
-                    param.stream = query['stream']
                 # replace template messages with input messages.
                 query['messages'] = messages
             elif isinstance(messages, str):
@@ -75,6 +77,8 @@ class OpenaiPlugin(ApiPluginBase):
             payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
             payload['frequency_penalty'] = param.frequency_penalty
+        if param.repetition_penalty is not None:
+            payload['repetition_penalty'] = param.repetition_penalty
         if param.logprobs is not None:
             payload['logprobs'] = param.logprobs
         if param.n_choices is not None:
@@ -92,68 +96,143 @@ class OpenaiPlugin(ApiPluginBase):
             payload['temperature'] = param.temperature
         if param.top_p is not None:
             payload['top_p'] = param.top_p
+        if param.top_k is not None:
+            payload['top_k'] = param.top_k
+        if param.extra_args is not None:
+            payload.update(param.extra_args)
         return payload
 
-    def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
+    def parse_responses(self, responses: List[Dict], request: str = None, **kwargs) -> tuple[int, int]:
         """Parser responses and return number of request and response tokens.
         Only one response for non-stream, multiple responses for stream.
         """
 
         # when stream, the last response is the full usage
        # when non-stream, the last response is the first response
-        last_response_js = json.loads(responses[-1])
+        last_response_js = responses[-1]
         if 'usage' in last_response_js and last_response_js['usage']:
             input_tokens = last_response_js['usage']['prompt_tokens']
             output_tokens = last_response_js['usage']['completion_tokens']
             return input_tokens, output_tokens
 
         # no usage information in the response, parse the response to get the tokens
-        delta_contents = {}
+        delta_contents = defaultdict(list)
         for response in responses:
-            js = json.loads(response)
-            if 'object' in js:
-                self.__process_response_object(js, delta_contents)
+            if 'object' in response:
+                self.__process_response_object(response, delta_contents)
             else:
-                self.__process_no_object(js, delta_contents)
+                self.__process_no_object(response, delta_contents)
 
         input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
         return input_tokens, output_tokens
 
-    def __process_response_object(self, js, delta_contents):
-        if js['object'] == 'chat.completion':
-            for choice in js['choices']:
+    def __process_response_object(self, response, delta_contents):
+        if not response.get('choices'):
+            return
+        if response['object'] == 'chat.completion':
+            for choice in response['choices']:
                 delta_contents[choice['index']] = [choice['message']['content']]
-        elif js['object'] == 'text_completion':
-            for choice in js['choices']:
-                delta_contents[choice['index']] = [choice['text']]
-        elif js['object'] == 'chat.completion.chunk':
-            for choice in js.get('choices', []):
+        elif response['object'] == 'text_completion':
+            for choice in response['choices']:
+                if 'text' in choice and 'index' in choice:
+                    delta_contents[choice['index']].append(choice['text'])
+        elif response['object'] == 'chat.completion.chunk':
+            for choice in response['choices']:
                 if 'delta' in choice and 'index' in choice:
                     delta = choice['delta']
                     idx = choice['index']
                     if 'content' in delta:
-                        delta_content = delta['content']
-                        delta_contents.setdefault(idx, []).append(delta_content)
+                        delta_contents[idx].append(delta['content'])
 
-    def __process_no_object(self, js, delta_contents):
+    def __process_no_object(self, response, delta_contents):
         # assume the response is a single choice
-        for choice in js['choices']:
+        if not response.get('choices'):
+            return
+        for choice in response['choices']:
             if 'delta' in choice:
                 delta = choice['delta']
                 idx = choice['index']
                 if 'content' in delta:
-                    delta_content = delta['content']
-                    delta_contents.setdefault(idx, []).append(delta_content)
+                    delta_contents[idx].append(delta['content'])
             else:
                 delta_contents[choice['index']] = [choice['message']['content']]
 
-    def __calculate_tokens_from_content(self, request, delta_contents):
+    def __calculate_tokens_from_content(self, request, content):
         input_tokens = output_tokens = 0
         if self.tokenizer is not None:
-            for idx, choice_contents in delta_contents.items():
+            # Calculate input tokens
+            input_tokens += self._count_input_tokens(request)
+            for idx, choice_contents in content.items():
                 full_response_content = ''.join(choice_contents)
-                input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
-                output_tokens += len(self.tokenizer.encode(full_response_content))
+                # Calculate output tokens
+                output_tokens += self._count_output_tokens(full_response_content)
         else:
-            logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')
+            raise ValueError(
+                'Error: Unable to retrieve usage information\n\n'
+                'This error occurs when:\n'
+                '1. The API response does not contain usage data, AND\n'
+                '2. No tokenizer has been specified or found.\n\n'
+                'To resolve this issue, do ONE of the following:\n'
+                "a) Ensure that the API you're using supports and returns usage information, OR\n"
+                'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
+                'If you continue to experience issues, '
+                'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
+            )
         return input_tokens, output_tokens
+
+    def _count_input_tokens(self, request_str: str) -> int:
+        """Count the number of input tokens in the request.
+
+        This method handles different types of requests and calculates tokens for:
+        - Text content in messages or prompts
+        - Images in multimodal messages (converted to patch tokens)
+
+        Args:
+            request_str (str): The request json str containing either 'messages' for chat
+                completion or 'prompt' for text completion.
+
+        Returns:
+            int: The total number of input tokens including text and image tokens.
+        """
+        input_tokens = 0
+        request = json.loads(request_str)
+        if 'messages' in request:
+            input_content = self.tokenizer.apply_chat_template(
+                request['messages'], tokenize=True, add_generation_prompt=True
+            )
+            input_tokens += len(input_content)
+            # handle image tokens if any
+            for message in request['messages']:
+                content = message.get('content', '')
+                if isinstance(content, str):
+                    continue
+                for cont in content:
+                    if cont['type'] == 'image_url':
+                        try:
+                            # assuming image_url is base64 string
+                            image_base64 = cont['image_url']['url']
+                            image = base64_to_PIL(image_base64)
+                            # Use math.ceil for more accurate token count when image dimensions
+                            # aren't perfectly divisible by patch size
+                            n_patches = (
+                                math.ceil(image.height / self.param.image_patch_size)
+                                * math.ceil(image.width / self.param.image_patch_size)
+                            )
+                            input_tokens += n_patches
+                        except Exception as e:
+                            logger.warning(f'Failed to process image for token counting: {e}')
+                            # Continue processing other content without failing
+        elif 'prompt' in request:
+            input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
        return input_tokens
+
+    def _count_output_tokens(self, response: str) -> int:
+        """Count the number of output tokens in the response. Only string response is supported.
+
+        Args:
+            response (str): The API response text.
+
+        Returns:
+            int: The number of output tokens.
+        """
+        return len(self.tokenizer.encode(response, add_special_tokens=False))
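
A note on the rewritten plugin: build_request now falls back to `self.param` when no per-call `param` is passed and forwards `repetition_penalty`, `top_k` and any `extra_args` into the payload, while `parse_responses` only counts tokens itself when the server returns no `usage` block, charging images as ceil(height/patch) * ceil(width/patch) patch tokens (a 1024x768 image with `image_patch_size=28` counts as 37 * 28 = 1036 tokens). A minimal sketch of a run that exercises these paths, assuming the fields visible in the hunks above (`tokenizer_path`, `extra_args`, the prompt-length bounds) are accepted as `Arguments` keywords:

    # Illustrative sketch only; endpoint, model and tokenizer names are placeholders.
    from evalscope.perf.arguments import Arguments
    from evalscope.perf.main import run_perf_benchmark

    args = Arguments(
        model='qwen2.5-7b-instruct',
        url='http://localhost:8000/v1/chat/completions',
        api_key='EMPTY',
        dataset='random',  # the 'random' plugin (added below) requires a tokenizer path
        tokenizer_path='Qwen/Qwen2.5-7B-Instruct',  # also enables the usage-fallback token counting
        min_prompt_length=128,
        max_prompt_length=1024,
        extra_args={'repetition_penalty': 1.05, 'top_k': 20},  # merged into the payload by build_request
    )

    run_perf_benchmark(args)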
evalscope/perf/plugin/datasets/__init__.py

@@ -1,6 +1,10 @@
-from evalscope.perf.plugin.datasets.custom import CustomDatasetPlugin
-from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
-from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
-from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
-from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
-from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
+from .base import DatasetPluginBase
+from .custom import CustomDatasetPlugin
+from .flickr8k import FlickrDatasetPlugin
+from .kontext_bench import KontextDatasetPlugin
+from .line_by_line import LineByLineDatasetPlugin
+from .longalpaca import LongAlpacaDatasetPlugin
+from .openqa import OpenqaDatasetPlugin
+from .random_dataset import RandomDatasetPlugin
+from .random_vl_dataset import RandomVLDatasetPlugin
+from .speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/plugin/datasets/base.py

@@ -1,7 +1,7 @@
 import json
 import sys
 from abc import abstractmethod
-from typing import Any, Dict, Iterator, List, Tuple
+from typing import Any, Dict, Iterator, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 
@@ -15,6 +15,11 @@ class DatasetPluginBase:
             dataset_path (str, optional): The input dataset path. Defaults to None.
         """
         self.query_parameters = query_parameters
+        if query_parameters.tokenizer_path:
+            from modelscope import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(query_parameters.tokenizer_path, trust_remote_code=True)
+        else:
+            self.tokenizer = None
 
     def __next__(self):
         for item in self.build_messages():
@@ -64,3 +69,40 @@
             data = json.loads(content)
             for item in data:
                 yield item
+
+    def create_message(self, text: str, image_urls: Union[List[str], str] = None, role: str = 'user') -> Dict:
+        """Create a message with text and optional image URLs.
+
+        Args:
+            text (str): The text content of the message.
+            image_urls (List[str], optional): List of image URLs. Defaults to None.
+            role (str, optional): The role of the message sender. Defaults to "user".
+
+        Returns:
+            Dict: A dictionary representing the message.
+        """
+        if image_urls is None:
+            message = {'role': role, 'content': text}
+        else:
+            message = {'role': role, 'content': [{'type': 'text', 'text': text}]}
+            if isinstance(image_urls, str):
+                image_urls = [image_urls]
+            for url in image_urls:
+                message['content'].append({'type': 'image_url', 'image_url': {'url': url}})
+        return message
+
+    def check_prompt_length(self, prompt: str) -> Tuple[bool, int]:
+        """Check if the prompt length is within the specified range.
+
+        Args:
+            prompt (str): The input prompt string.
+
+        Returns:
+            Tuple[bool, int]: A tuple containing a boolean indicating whether the prompt is valid and its length.
+        """
+        if self.tokenizer is None:
+            prompt_length = len(prompt)
+        else:
+            prompt_length = len(self.tokenizer.encode(prompt))
+        is_valid = self.query_parameters.min_prompt_length <= prompt_length <= self.query_parameters.max_prompt_length
+        return is_valid, prompt_length
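
DatasetPluginBase now owns an optional tokenizer plus two helpers shared by the concrete plugins below: check_prompt_length (token-based when a tokenizer is configured, character-based otherwise) and create_message (a plain chat message, or a content list with image_url entries when image URLs are passed). A sketch of a third-party plugin built on these helpers; the plugin name 'my_jsonl' and the 'question' field are illustrative, not part of the package:

    import json
    from typing import Dict, Iterator, List

    from evalscope.perf.plugin.datasets.base import DatasetPluginBase
    from evalscope.perf.plugin.registry import register_dataset


    @register_dataset('my_jsonl')  # hypothetical plugin name
    class MyJsonlDatasetPlugin(DatasetPluginBase):

        def build_messages(self) -> Iterator[List[Dict]]:
            for line in self.dataset_line_by_line(self.query_parameters.dataset_path):
                prompt = json.loads(line)['question'].strip()  # 'question' is an assumed field
                # Token-based length check when --tokenizer-path is set, character-based otherwise.
                is_valid, _ = self.check_prompt_length(prompt)
                if not is_valid:
                    continue
                if self.query_parameters.apply_chat_template:
                    yield [self.create_message(prompt)]  # chat-format request
                else:
                    yield prompt  # raw completion prompt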
evalscope/perf/plugin/datasets/custom.py

@@ -16,6 +16,25 @@ class CustomDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                    prompt) < self.query_parameters.max_prompt_length:
-                yield [{'role': 'user', 'content': prompt}]
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
+                if self.query_parameters.apply_chat_template:
+                    message = self.create_message(prompt)
+                    yield [message]
+                else:
+                    yield prompt
+
+
+if __name__ == '__main__':
+    from evalscope.perf.arguments import Arguments
+    from evalscope.perf.main import run_perf_benchmark
+
+    args = Arguments(
+        model='qwen2.5-7b-instruct',
+        url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        dataset_path='outputs/perf_data.txt',
+        api_key='EMPTY',
+        dataset='custom',
+    )
+
+    run_perf_benchmark(args)
evalscope/perf/plugin/datasets/flickr8k.py

@@ -1,18 +1,9 @@
-import base64
-from io import BytesIO
-from PIL import Image
 from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
-
-
-def PIL_to_base64(image: Image.Image) -> str:
-    buffered = BytesIO()
-    image.save(buffered, format='JPEG')
-    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
-    return img_str
+from evalscope.utils.io_utils import PIL_to_base64
 
 
 @register_dataset('flickr8k')
@@ -30,21 +21,8 @@ class FlickrDatasetPlugin(DatasetPluginBase):
 
         for item in dataset:
             pil_image = item['jpg']
-            base64_iamge = PIL_to_base64(pil_image)
+            text = item['txt']
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            yield [{
-                'role':
-                'user',
-                'content': [
-                    {
-                        'type': 'text',
-                        'text': 'Describe the image'
-                    },
-                    {
-                        'type': 'image_url',
-                        'image_url': {
-                            'url': f'data:image/jpeg;base64,{base64_iamge}',
-                        }
-                    },
-                ],
-            }]
+            message = self.create_message(text=text, image_urls=base64_image)
+            yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py (new file)

@@ -0,0 +1,28 @@
+from typing import Any, Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('kontext_bench')
+class KontextDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    Datasets: https://modelscope.cn/datasets/black-forest-labs/kontext-bench/dataPeview
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        from modelscope.msdatasets import MsDataset
+        dataset = MsDataset.load('black-forest-labs/kontext-bench', subset_name='default', split='test')
+
+        for item in dataset:
+            pil_image = item['image']
+            text = item['instruction']
+            base64_image = PIL_to_base64(pil_image, add_header=True)
+
+            message = self.create_message(text=text, image_urls=base64_image)
+            yield [message]
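
The new kontext_bench plugin pairs each editing instruction with a base64 data URI produced by PIL_to_base64(..., add_header=True), so every request is a multimodal chat message. A hedged sketch of pointing the benchmark at it, assuming image_patch_size is an Arguments field (it is read as self.param.image_patch_size in the OpenAI plugin above) and using placeholder model and endpoint names:

    from evalscope.perf.arguments import Arguments
    from evalscope.perf.main import run_perf_benchmark

    run_perf_benchmark(
        Arguments(
            model='qwen2.5-vl-7b-instruct',  # placeholder vision model
            url='http://localhost:8000/v1/chat/completions',
            api_key='EMPTY',
            dataset='kontext_bench',  # name registered by the plugin above
            tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',  # used when responses omit `usage`
            image_patch_size=28,  # image tokens estimated as ceil(H/28) * ceil(W/28)
        )
    )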
evalscope/perf/plugin/datasets/line_by_line.py

@@ -17,6 +17,10 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                    prompt) < self.query_parameters.max_prompt_length:
-                yield [{'role': 'user', 'content': prompt}]
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
+                if self.query_parameters.apply_chat_template:
+                    message = self.create_message(prompt)
+                    yield [message]
+                else:
+                    yield prompt
@@ -22,6 +22,10 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
         ds = self.dataset_json_list(self.query_parameters.dataset_path)
         for item in ds:
             prompt = item['instruction'].strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                    prompt) < self.query_parameters.max_prompt_length:
-                yield [{'role': 'user', 'content': prompt}]
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
+                if self.query_parameters.apply_chat_template:
+                    message = self.create_message(prompt)
+                    yield [message]
+                else:
+                    yield prompt
@@ -1,5 +1,5 @@
 import json
-import subprocess
+import os
 from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
@@ -18,20 +18,19 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
 
     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
-            subprocess.call([
-                'modelscope',
-                'download',
-                '--dataset',
-                'AI-ModelScope/HC3-Chinese',
-                'open_qa.jsonl',
-                '--local_dir',
-                './data',
-            ])
-            self.query_parameters.dataset_path = './data/open_qa.jsonl'
+            from modelscope import dataset_snapshot_download
+
+            file_name = 'open_qa.jsonl'
+            local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+            self.query_parameters.dataset_path = os.path.join(local_path, file_name)
 
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
             prompt = item['question'].strip()
-            if (len(prompt) > self.query_parameters.min_prompt_length
-                    and len(prompt) < self.query_parameters.max_prompt_length):
-                yield [{'role': 'user', 'content': prompt}]
+            is_valid, _ = self.check_prompt_length(prompt)
+            if is_valid:
+                if self.query_parameters.apply_chat_template:
+                    message = self.create_message(prompt)
+                    yield [message]
+                else:
+                    yield prompt
@@ -0,0 +1,67 @@
+import numpy as np
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('random')
+class RandomDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        assert query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer-path`.'  # noqa: E501
+        super().__init__(query_parameters)
+
+        self.prefix_length = self.query_parameters.prefix_length
+        self.prefix_ids = self.get_random_inputs(self.prefix_length)
+        self.template_len = self.get_template_len()
+        self.number = self.query_parameters.number or 1
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        if self.query_parameters.apply_chat_template:
+            min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+            max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+        else:
+            min_prompt_length = self.query_parameters.min_prompt_length
+            max_prompt_length = self.query_parameters.max_prompt_length + 1
+
+        assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
+        assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
+
+        # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1  # noqa: E501
+        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+        vocab_size = self.tokenizer.vocab_size
+
+        for i in range(self.number):
+            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % vocab_size).tolist()
+            token_sequence = self.prefix_ids + inner_seq
+            prompt = self.tokenizer.decode(token_sequence)
+
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            total_input_len = self.prefix_length + int(input_lens[i])
+            re_encoded_sequence = self.tokenizer.encode(prompt, add_special_tokens=False)[:total_input_len]
+            prompt = self.tokenizer.decode(re_encoded_sequence)
+
+            if self.query_parameters.apply_chat_template:
+                message = self.create_message(prompt)
+                yield [message]
+            else:
+                yield prompt
+
+    def get_random_inputs(self, length: int) -> List[int]:
+        if length <= 0:
+            return []
+        input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+        return input_ids
+
+    def get_template_len(self):
+        empty_message = [self.create_message(text='')]
+        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+        return len(template)
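The 'random' plugin synthesizes prompts from random token ids, so it needs a tokenizer and explicit length bounds. A launch sketch using only the Arguments fields the plugin reads (query_parameters.tokenizer_path, min/max_prompt_length, prefix_length, number); the endpoint and model are placeholders:

from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

# Placeholder endpoint; prompt lengths are measured in tokens of the given tokenizer.
args = Arguments(
    model='qwen2.5-7b-instruct',
    url='http://127.0.0.1:8801/v1/chat/completions',
    api_key='EMPTY',
    dataset='random',
    tokenizer_path='Qwen/Qwen2.5-7B-Instruct',  # required; the plugin asserts on it
    min_prompt_length=128,
    max_prompt_length=1024,
    prefix_length=0,   # shared random prefix prepended to every prompt
    number=100,        # how many random prompts to generate
)
run_perf_benchmark(args)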
@@ -0,0 +1,80 @@
+import random
+from PIL import Image, ImageDraw
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
+from evalscope.perf.plugin.registry import register_dataset
+from evalscope.utils.io_utils import PIL_to_base64
+
+
+@register_dataset('random_vl')
+class RandomVLDatasetPlugin(RandomDatasetPlugin):
+    """Random Vision-Language Dataset Plugin for multimodal model stress testing."""
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+        # Vision-language specific parameters
+        self.image_width = query_parameters.image_width
+        self.image_height = query_parameters.image_height
+        self.image_format = query_parameters.image_format
+        self.image_num = query_parameters.image_num
+
+        assert self.image_num > 0, 'image_num must be greater than 0.'
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        # Reuse parent's message generation logic
+        for messages in super().build_messages():
+            prompt = messages[0]['content'] if isinstance(messages[0], dict) else messages[0]
+
+            # Generate random images based on image_num
+            images_b64 = []
+            for _ in range(self.image_num):
+                images_b64.append(self._generate_random_image_b64())
+
+            message = self.create_message(text=prompt, image_urls=images_b64)
+            yield [message]
+
+    def _generate_random_image_b64(self) -> str:
+        """Generate a random image and return as base64 string."""
+        # Create a random colored image
+        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+        image = Image.new(self.image_format, (self.image_width, self.image_height), color)
+
+        # Add some random shapes for variety
+        draw = ImageDraw.Draw(image)
+        for _ in range(random.randint(1, 5)):
+            shape_type = random.choice(['rectangle', 'ellipse', 'line'])
+
+            # Generate two random points
+            x1 = random.randint(0, self.image_width - 1)
+            y1 = random.randint(0, self.image_height - 1)
+            x2 = random.randint(0, self.image_width - 1)
+            y2 = random.randint(0, self.image_height - 1)
+
+            # Ensure proper coordinate ordering (x1 <= x2, y1 <= y2)
+            if x1 > x2:
+                x1, x2 = x2, x1
+            if y1 > y2:
+                y1, y2 = y2, y1
+
+            # Ensure we have at least a 1-pixel difference
+            if x1 == x2:
+                x2 = min(x1 + 1, self.image_width - 1)
+            if y1 == y2:
+                y2 = min(y1 + 1, self.image_height - 1)
+
+            coords = [x1, y1, x2, y2]
+
+            shape_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+
+            if shape_type == 'rectangle':
+                draw.rectangle(coords, fill=shape_color)
+            elif shape_type == 'ellipse':
+                draw.ellipse(coords, fill=shape_color)
+            else:
+                draw.line(coords, fill=shape_color, width=random.randint(1, 5))
+
+        # Convert to base64
+        return PIL_to_base64(image, format='PNG', add_header=True)
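The 'random_vl' plugin extends the configuration above with image fields; note that image_format is used as the PIL image mode passed to Image.new (e.g. 'RGB'), while the encoded payload is always PNG. A sketch, assuming the Arguments fields named in the plugin accept the values shown and with placeholder model, endpoint, and tokenizer:

from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

# Placeholder endpoint; image_format is the PIL image mode, not the file format.
args = Arguments(
    model='qwen-vl-plus',
    url='http://127.0.0.1:8801/v1/chat/completions',
    api_key='EMPTY',
    dataset='random_vl',
    tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',  # inherited requirement from the 'random' plugin
    min_prompt_length=128,
    max_prompt_length=1024,
    number=50,
    image_width=512,
    image_height=512,
    image_format='RGB',
    image_num=1,   # number of random images attached to each prompt
)
run_perf_benchmark(args)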