evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
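Among the additions is a new OpenAI-compatibility module, evalscope/models/utils/openai.py (+708 -0), whose diff is reproduced below. It converts between evalscope's internal ChatMessage/ModelOutput types and the openai SDK's request/response types. As a rough illustration only (a minimal sketch, not part of the diff, assuming the module paths match the file paths listed above), the conversion helpers shown below could be used to round-trip messages like this:

    from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
    from evalscope.models.utils.openai import chat_messages_from_openai, openai_chat_messages

    # Build evalscope-native chat messages.
    messages = [
        ChatMessageSystem(content='You are a concise assistant.'),
        ChatMessageUser(content='What is 2 + 2?'),
    ]

    # Convert to OpenAI ChatCompletionMessageParam dicts (what an
    # OpenAI-compatible endpoint expects), then back into ChatMessage objects.
    openai_params = openai_chat_messages(messages, system_role='system')
    restored = chat_messages_from_openai(model='my-model', messages=openai_params)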
evalscope/models/utils/openai.py
@@ -0,0 +1,708 @@
1
+ import base64
2
+ import json
3
+ import re
4
+ from collections import defaultdict
5
+ from copy import copy
6
+ from openai import APIStatusError, OpenAIError
7
+ from openai.types.chat import (
8
+ ChatCompletion,
9
+ ChatCompletionAssistantMessageParam,
10
+ ChatCompletionChunk,
11
+ ChatCompletionContentPartImageParam,
12
+ ChatCompletionContentPartInputAudioParam,
13
+ ChatCompletionContentPartParam,
14
+ ChatCompletionContentPartRefusalParam,
15
+ ChatCompletionContentPartTextParam,
16
+ ChatCompletionDeveloperMessageParam,
17
+ ChatCompletionMessage,
18
+ ChatCompletionMessageParam,
19
+ ChatCompletionMessageToolCall,
20
+ ChatCompletionMessageToolCallParam,
21
+ ChatCompletionNamedToolChoiceParam,
22
+ ChatCompletionSystemMessageParam,
23
+ ChatCompletionToolChoiceOptionParam,
24
+ ChatCompletionToolMessageParam,
25
+ ChatCompletionToolParam,
26
+ ChatCompletionUserMessageParam,
27
+ )
28
+ from openai.types.chat.chat_completion import Choice, ChoiceLogprobs
29
+ from openai.types.chat.chat_completion_message_tool_call import Function
30
+ from openai.types.completion_usage import CompletionUsage
31
+ from openai.types.shared_params.function_definition import FunctionDefinition
32
+ from pydantic import JsonValue
33
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
34
+
35
+ from evalscope.api.messages import (
36
+ ChatMessage,
37
+ ChatMessageAssistant,
38
+ ChatMessageSystem,
39
+ ChatMessageTool,
40
+ ChatMessageUser,
41
+ Content,
42
+ ContentAudio,
43
+ ContentImage,
44
+ ContentReasoning,
45
+ ContentText,
46
+ parse_content_with_reasoning,
47
+ )
48
+ from evalscope.api.model import (
49
+ ChatCompletionChoice,
50
+ GenerateConfig,
51
+ Logprobs,
52
+ ModelOutput,
53
+ ModelUsage,
54
+ StopReason,
55
+ as_stop_reason,
56
+ )
57
+ from evalscope.api.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo, parse_tool_call
58
+ from evalscope.utils.url_utils import file_as_data_uri, is_http_url
59
+
60
+ BASE_64_DATA_REMOVED = '<base64-data-removed>'
61
+
62
+
63
+ class OpenAIResponseError(OpenAIError):
64
+
65
+ def __init__(self, code: str, message: str) -> None:
66
+ self.code = code
67
+ self.message = message
68
+
69
+ def __str__(self) -> str:
70
+ return f'{self.code}: {self.message}'
71
+
72
+
73
+ def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
74
+ return ChatCompletionMessageToolCall(
75
+ type='function',
76
+ id=tool_call.id,
77
+ function=Function(name=tool_call.function.name, arguments=json.dumps(tool_call.function.arguments)),
78
+ )
79
+
80
+
81
+ def openai_chat_tool_call_param(tool_call: ToolCall) -> ChatCompletionMessageToolCallParam:
82
+ return ChatCompletionMessageToolCallParam(
83
+ id=tool_call.id,
84
+ function=dict(name=tool_call.function.name, arguments=json.dumps(tool_call.function.arguments)),
85
+ type='function',
86
+ )
87
+
88
+
89
+ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartParam:
90
+ if content.type == 'text':
91
+ return ChatCompletionContentPartTextParam(type='text', text=content.text)
92
+ elif content.type == 'image':
93
+ # API takes URL or base64 encoded file. If it's a remote file or
94
+ # data URL leave it alone, otherwise encode it
95
+ image_url = content.image
96
+ detail = content.detail
97
+
98
+ if not is_http_url(image_url):
99
+ image_url = file_as_data_uri(image_url)
100
+
101
+ return ChatCompletionContentPartImageParam(
102
+ type='image_url',
103
+ image_url=dict(url=image_url, detail=detail),
104
+ )
105
+ elif content.type == 'audio':
106
+ audio_data_uri = file_as_data_uri(content.audio)
107
+
108
+ return ChatCompletionContentPartInputAudioParam(
109
+ type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
110
+ )
111
+
112
+ else:
113
+ raise RuntimeError('Video content is not currently supported by Open AI chat models.')
114
+
115
+
116
+ def openai_chat_message(
117
+ message: ChatMessage, system_role: Literal['user', 'system', 'developer'] = 'system'
118
+ ) -> ChatCompletionMessageParam:
119
+ if message.role == 'system':
120
+ if system_role == 'user':
121
+ return ChatCompletionUserMessageParam(role='user', content=message.text)
122
+ elif system_role == 'system':
123
+ return ChatCompletionSystemMessageParam(role=message.role, content=message.text)
124
+ elif system_role == 'developer':
125
+ return ChatCompletionDeveloperMessageParam(role='developer', content=message.text)
126
+ elif message.role == 'user':
127
+ return ChatCompletionUserMessageParam(
128
+ role=message.role,
129
+ content=(
130
+ message.content if isinstance(message.content, str) else
131
+ [openai_chat_completion_part(content) for content in message.content]
132
+ ),
133
+ )
134
+ elif message.role == 'assistant':
135
+ if message.tool_calls:
136
+ return ChatCompletionAssistantMessageParam(
137
+ role=message.role,
138
+ content=openai_assistant_content(message),
139
+ tool_calls=[openai_chat_tool_call_param(call) for call in message.tool_calls],
140
+ )
141
+ else:
142
+ return ChatCompletionAssistantMessageParam(role=message.role, content=openai_assistant_content(message))
143
+ elif message.role == 'tool':
144
+ return ChatCompletionToolMessageParam(
145
+ role=message.role,
146
+ content=(f'Error: {message.error.message}' if message.error else message.text),
147
+ tool_call_id=str(message.tool_call_id),
148
+ )
149
+ else:
150
+ raise ValueError(f'Unexpected message role {message.role}')
151
+
152
+
153
+ def openai_chat_messages(
154
+ messages: List[ChatMessage],
155
+ system_role: Literal['user', 'system', 'developer'] = 'system',
156
+ ) -> List[ChatCompletionMessageParam]:
157
+ return [openai_chat_message(message, system_role) for message in messages]
158
+
159
+
160
+ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) -> Dict[str, Any]:
161
+ params: Dict[str, Any] = dict(model=model)
162
+ # handle stream option
163
+ if config.stream is not None:
164
+ params['stream'] = config.stream
165
+ if config.stream:
166
+ params['stream_options'] = {'include_usage': True}
167
+ if config.timeout is not None:
168
+ params['timeout'] = config.timeout
169
+ if config.max_tokens is not None:
170
+ params['max_tokens'] = config.max_tokens
171
+ if config.frequency_penalty is not None:
172
+ params['frequency_penalty'] = config.frequency_penalty
173
+ if config.stop_seqs is not None:
174
+ params['stop'] = config.stop_seqs
175
+ if config.presence_penalty is not None:
176
+ params['presence_penalty'] = config.presence_penalty
177
+ if config.repetition_penalty is not None:
178
+ params['repetition_penalty'] = config.repetition_penalty
179
+ if config.logit_bias is not None:
180
+ params['logit_bias'] = config.logit_bias
181
+ if config.seed is not None:
182
+ params['seed'] = config.seed
183
+ if config.temperature is not None:
184
+ params['temperature'] = config.temperature
185
+ if config.top_p is not None:
186
+ params['top_p'] = config.top_p
187
+ if config.top_k is not None:
188
+ params['top_k'] = config.top_k
189
+ if config.n is not None:
190
+ params['n'] = config.n
191
+ if config.logprobs is not None:
192
+ params['logprobs'] = config.logprobs
193
+ if config.top_logprobs is not None:
194
+ params['top_logprobs'] = config.top_logprobs
195
+ if tools and config.parallel_tool_calls is not None:
196
+ params['parallel_tool_calls'] = config.parallel_tool_calls
197
+ if config.reasoning_effort is not None:
198
+ params['reasoning_effort'] = config.reasoning_effort
199
+ if config.response_schema is not None:
200
+ params['response_format'] = dict(
201
+ type='json_schema',
202
+ json_schema=dict(
203
+ name=config.response_schema.name,
204
+ schema=config.response_schema.json_schema.model_dump(exclude_none=True),
205
+ description=config.response_schema.description,
206
+ strict=config.response_schema.strict,
207
+ ),
208
+ )
209
+ if config.extra_body:
210
+ params['extra_body'] = config.extra_body
211
+ if config.extra_query:
212
+ params['extra_query'] = config.extra_query
213
+ if config.extra_headers:
214
+ params['extra_headers'] = config.extra_headers
215
+
216
+ return params
217
+
218
+
219
+ def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
220
+ # In agent bridge scenarios, we could encounter concepts such as reasoning and
221
+ # .internal use in the ChatMessageAssistant that are not supported by the OpenAI
222
+ # choices API. This code smuggles that data into the plain text so that it
223
+ # survives multi-turn round trips.
224
+
225
+ if isinstance(message.content, str):
226
+ content = message.content
227
+ else:
228
+ content = ''
229
+ for c in message.content:
230
+ if c.type == 'reasoning' and include_reasoning:
231
+ attribs = ''
232
+ if c.signature is not None:
233
+ attribs = f'{attribs} signature="{c.signature}"'
234
+ if c.redacted:
235
+ attribs = f'{attribs} redacted="true"'
236
+ content = f'{content}\n<think{attribs}>\n{c.reasoning}\n</think>\n'
237
+ elif c.type == 'text':
238
+ content = f'{content}\n{c.text}'
239
+
240
+ if message.internal:
241
+ content = f"""{content}\n<internal>{
242
+ base64.b64encode(json.dumps(message.internal).encode("utf-8")).decode(
243
+ "utf-8"
244
+ )
245
+ }</internal>\n"""
246
+ return content
247
+
248
+
249
+ def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
250
+ oai_choices: List[Choice] = []
251
+
252
+ for index, choice in enumerate(choices):
253
+ # Handle content
254
+ content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
255
+
256
+ # Handle tool calls
257
+ if choice.message.tool_calls:
258
+ tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
259
+ else:
260
+ tool_calls = None
261
+ message = ChatCompletionMessage(role='assistant', content=content, tool_calls=tool_calls)
262
+ oai_choices.append(
263
+ Choice(
264
+ finish_reason=openai_finish_reason(choice.stop_reason),
265
+ index=index,
266
+ message=message,
267
+ logprobs=ChoiceLogprobs(**choice.logprobs.model_dump()) if choice.logprobs is not None else None,
268
+ )
269
+ )
270
+
271
+ return oai_choices
272
+
273
+
274
+ def openai_completion_usage(usage: ModelUsage) -> CompletionUsage:
275
+ return CompletionUsage(
276
+ completion_tokens=usage.output_tokens,
277
+ prompt_tokens=usage.input_tokens,
278
+ total_tokens=usage.total_tokens,
279
+ )
280
+
281
+
282
+ def openai_finish_reason(
283
+ stop_reason: StopReason
284
+ ) -> Literal['stop', 'length', 'tool_calls', 'content_filter', 'function_call']:
285
+ if stop_reason in ('stop', 'tool_calls', 'content_filter'):
286
+ return stop_reason
287
+ elif stop_reason == 'model_length':
288
+ return 'length'
289
+ else:
290
+ return 'stop'
291
+
292
+
293
+ def openai_chat_tool_param(tool: ToolInfo) -> ChatCompletionToolParam:
294
+ function = FunctionDefinition(
295
+ name=tool.name,
296
+ description=tool.description,
297
+ parameters=tool.parameters.model_dump(exclude_none=True),
298
+ )
299
+ return ChatCompletionToolParam(type='function', function=function)
300
+
301
+
302
+ def openai_chat_tools(tools: List[ToolInfo]) -> List[ChatCompletionToolParam]:
303
+ return [openai_chat_tool_param(tool) for tool in tools]
304
+
305
+
306
+ def openai_chat_tool_choice(tool_choice: ToolChoice, ) -> ChatCompletionToolChoiceOptionParam:
307
+ if isinstance(tool_choice, ToolFunction):
308
+ return ChatCompletionNamedToolChoiceParam(type='function', function=dict(name=tool_choice.name))
309
+ # openai supports 'any' via the 'required' keyword
310
+ elif tool_choice == 'any':
311
+ return 'required'
312
+ else:
313
+ return tool_choice
314
+
315
+
316
+ def chat_tool_calls_from_openai(message: ChatCompletionMessage, tools: List[ToolInfo]) -> Optional[List[ToolCall]]:
317
+ if message.tool_calls:
318
+ return [
319
+ parse_tool_call(call.id, call.function.name, call.function.arguments, tools) for call in message.tool_calls
320
+ ]
321
+ else:
322
+ return None
323
+
324
+
325
+ def chat_messages_from_openai(
326
+ model: str,
327
+ messages: List[ChatCompletionMessageParam],
328
+ ) -> List[ChatMessage]:
329
+ # track tool names by id
330
+ tool_names: Dict[str, str] = {}
331
+
332
+ chat_messages: List[ChatMessage] = []
333
+
334
+ for message in messages:
335
+ content: Union[str, List[Content]] = []
336
+ if message['role'] == 'system' or message['role'] == 'developer':
337
+ sys_content = message['content']
338
+ if isinstance(sys_content, str):
339
+ chat_messages.append(ChatMessageSystem(content=sys_content))
340
+ else:
341
+ content = []
342
+ for sc in sys_content:
343
+ content.extend(content_from_openai(sc))
344
+ chat_messages.append(ChatMessageSystem(content=content))
345
+ elif message['role'] == 'user':
346
+ user_content = message['content']
347
+ if isinstance(user_content, str):
348
+ chat_messages.append(ChatMessageUser(content=user_content))
349
+ else:
350
+ content = []
351
+ for uc in user_content:
352
+ content.extend(content_from_openai(uc))
353
+ chat_messages.append(ChatMessageUser(content=content))
354
+ elif message['role'] == 'assistant':
355
+ # resolve content
356
+ refusal: Optional[Literal[True]] = None
357
+ internal: Optional[JsonValue] = None
358
+ asst_content = message.get('content', None)
359
+ if isinstance(asst_content, str):
360
+ # Even though the choices API doesn't take advantage of .internal,
361
+ # we could be transforming from OpenAI choices to Inspect for agent
362
+ # bridge scenarios where a different model (that does use .internal)
363
+ # is the actual model being used.
364
+ asst_content, internal = _parse_content_with_internal(asst_content)
365
+ asst_content, smuggled_reasoning = parse_content_with_reasoning(asst_content)
366
+ if smuggled_reasoning:
367
+ content = [
368
+ smuggled_reasoning,
369
+ ContentText(text=asst_content),
370
+ ]
371
+ else:
372
+ content = asst_content
373
+ elif asst_content is None:
374
+ content = message.get('refusal', None) or ''
375
+ if content:
376
+ refusal = True
377
+ else:
378
+ content = []
379
+ for ac in asst_content:
380
+ content.extend(content_from_openai(ac, parse_reasoning=True))
381
+
382
+ # resolve reasoning (OpenAI doesn't suport this however OpenAI-compatible
383
+ # interfaces e.g. DeepSeek do include this field so we pluck it out)
384
+ reasoning = message.get('reasoning_content', None) or message.get('reasoning', None)
385
+ if reasoning is not None:
386
+ # normalize content to an array
387
+ if isinstance(content, str):
388
+ content = [ContentText(text=content, refusal=refusal)]
389
+
390
+ # insert reasoning
391
+ content.insert(0, ContentReasoning(reasoning=str(reasoning)))
392
+
393
+ # return message
394
+ if 'tool_calls' in message:
395
+ tool_calls: List[ToolCall] = []
396
+ for call in message['tool_calls']:
397
+ tool_calls.append(tool_call_from_openai(call))
398
+ tool_names[call['id']] = call['function']['name']
399
+
400
+ else:
401
+ tool_calls = []
402
+
403
+ chat_messages.append(
404
+ ChatMessageAssistant(
405
+ content=content,
406
+ tool_calls=tool_calls or None,
407
+ model=model,
408
+ source='generate',
409
+ internal=internal,
410
+ )
411
+ )
412
+ elif message['role'] == 'tool':
413
+ tool_content = message.get('content', None) or ''
414
+ if isinstance(tool_content, str):
415
+ # If tool_content is a simple str, it could be the result of some
416
+ # sub-agent tool call that has <think> or <internal> smuggled inside
417
+ # of it to support agent bridge scenarios. We have to strip that
418
+ # data. To be clear, if it's <think>, we'll strip the <think> tag,
419
+ # but the reasoning summary itself will remain in the content.
420
+ content, _ = _parse_content_with_internal(tool_content)
421
+ content, _ = parse_content_with_reasoning(content)
422
+ else:
423
+ content = []
424
+ for tc in tool_content:
425
+ content.extend(content_from_openai(tc))
426
+ chat_messages.append(
427
+ ChatMessageTool(
428
+ content=content,
429
+ tool_call_id=message['tool_call_id'],
430
+ function=tool_names.get(message['tool_call_id'], ''),
431
+ )
432
+ )
433
+ else:
434
+ raise ValueError(f'Unexpected message param type: {type(message)}')
435
+
436
+ return chat_messages
437
+
438
+
439
+ def tool_call_from_openai(tool_call: ChatCompletionMessageToolCallParam) -> ToolCall:
440
+ return parse_tool_call(
441
+ tool_call['id'],
442
+ tool_call['function']['name'],
443
+ tool_call['function']['arguments'],
444
+ )
445
+
446
+
447
+ def content_from_openai(
448
+ content: Union[ChatCompletionContentPartParam, ChatCompletionContentPartRefusalParam],
449
+ parse_reasoning: bool = False,
450
+ ) -> List[Content]:
451
+ # Some providers omit the type tag and use "object-with-a-single-field" encoding
452
+ if 'type' not in content and len(content) == 1:
453
+ content['type'] = list(content.keys())[0] # type: ignore[arg-type]
454
+ if content['type'] == 'text':
455
+ text = content['text']
456
+ if parse_reasoning:
457
+ content_text, content_reasoning = parse_content_with_reasoning(text)
458
+ if content_reasoning:
459
+ return [
460
+ content_reasoning,
461
+ ContentText(text=content_text),
462
+ ]
463
+ else:
464
+ return [ContentText(text=text)]
465
+ else:
466
+ return [ContentText(text=text)]
467
+ elif content['type'] == 'reasoning': # type: ignore[comparison-overlap]
468
+ return [ContentReasoning(reasoning=content['reasoning'])]
469
+ elif content['type'] == 'image_url':
470
+ return [ContentImage(image=content['image_url']['url'], detail=content['image_url']['detail'])]
471
+ elif content['type'] == 'input_audio':
472
+ return [ContentAudio(
473
+ audio=content['input_audio']['data'],
474
+ format=content['input_audio']['format'],
475
+ )]
476
+ elif content['type'] == 'refusal':
477
+ return [ContentText(text=content['refusal'], refusal=True)]
478
+ else:
479
+ content_type = content['type']
480
+ raise ValueError(f"Unexpected content type '{content_type}' in message.")
481
+
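A rough usage sketch for the part normalization above; the expected results are shown as comments and assume content_from_openai is importable from this module (exact path not shown in this hunk):

# A typed text part whose text carries a <think> block, and an untyped
# single-field part that relies on the "object-with-a-single-field" encoding.
part_text = {'type': 'text', 'text': '<think>weighing options</think>Paris'}
part_refusal = {'refusal': 'I cannot help with that'}

# content_from_openai(part_text, parse_reasoning=True)
#   -> roughly [ContentReasoning(reasoning='weighing options'), ContentText(text='Paris')]
# content_from_openai(part_refusal)
#   -> [ContentText(text='I cannot help with that', refusal=True)]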
482
+
483
+ def chat_message_assistant_from_openai(
484
+ model: str, message: ChatCompletionMessage, tools: List[ToolInfo]
485
+ ) -> ChatMessageAssistant:
486
+ refusal = getattr(message, 'refusal', None)
487
+ reasoning = getattr(message, 'reasoning_content', None) or getattr(message, 'reasoning', None)
488
+
489
+ msg_content = refusal or message.content or ''
490
+ if reasoning is not None:
491
+ content: Union[str, List[Content]] = [
492
+ ContentReasoning(reasoning=str(reasoning)),
493
+ ContentText(text=msg_content, refusal=True if refusal else None),
494
+ ]
495
+ elif refusal is not None:
496
+ content = [ContentText(text=msg_content, refusal=True)]
497
+ else:
498
+ content = msg_content
499
+
500
+ return ChatMessageAssistant(
501
+ content=content,
502
+ model=model,
503
+ source='generate',
504
+ tool_calls=chat_tool_calls_from_openai(message, tools),
505
+ )
506
+
507
+
508
+ def model_output_from_openai(
509
+ completion: ChatCompletion,
510
+ choices: List[ChatCompletionChoice],
511
+ ) -> ModelOutput:
512
+ return ModelOutput(
513
+ model=completion.model,
514
+ choices=choices,
515
+ usage=(
516
+ ModelUsage(
517
+ input_tokens=completion.usage.prompt_tokens,
518
+ output_tokens=completion.usage.completion_tokens,
519
+ input_tokens_cache_read=(
520
+ completion.usage.prompt_tokens_details.cached_tokens if completion.usage.prompt_tokens_details
521
+ is not None else None # OpenAI only has cache-read stats/pricing.
522
+ ),
523
+ reasoning_tokens=(
524
+ completion.usage.completion_tokens_details.reasoning_tokens
525
+ if completion.usage.completion_tokens_details is not None else None
526
+ ),
527
+ total_tokens=completion.usage.total_tokens,
528
+ ) if completion.usage else None
529
+ ),
530
+ )
531
+
532
+
533
+ def chat_choices_from_openai(response: ChatCompletion, tools: List[ToolInfo]) -> List[ChatCompletionChoice]:
534
+ choices = list(response.choices)
535
+ choices.sort(key=lambda c: c.index)
536
+ return [
537
+ ChatCompletionChoice(
538
+ message=chat_message_assistant_from_openai(response.model, choice.message, tools),
539
+ stop_reason=as_stop_reason(choice.finish_reason),
540
+ logprobs=(
541
+ Logprobs(**choice.logprobs.model_dump())
542
+ if choice.logprobs and choice.logprobs.content is not None else None
543
+ ),
544
+ ) for choice in choices
545
+ ]
546
+
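A minimal usage sketch tying the two converters together; completion, tools and the import path are assumptions, not shown in this hunk:

# completion: ChatCompletion returned by client.chat.completions.create(...)
# tools: the List[ToolInfo] advertised for the request
choices = chat_choices_from_openai(completion, tools)
output = model_output_from_openai(completion, choices)
# output.usage.input_tokens_cache_read is populated only when the API reports
# prompt_tokens_details.cached_tokens (OpenAI exposes cache-read stats only).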
547
+
548
+ def openai_handle_bad_request(model_name: str, e: APIStatusError) -> Union[ModelOutput, Exception]:
549
+ # extract message
550
+ if isinstance(e.body, dict) and 'message' in e.body:
551
+ content = str(e.body.get('message'))
552
+ else:
553
+ content = e.message
554
+
555
+ # narrow stop_reason
556
+ stop_reason: Optional[StopReason] = None
557
+ if e.code == 'context_length_exceeded':
558
+ stop_reason = 'model_length'
559
+ elif (
560
+ e.code == 'invalid_prompt' # seems to happen for o1/o3
561
+ or e.code == 'content_policy_violation' # seems to happen for vision
562
+ or e.code == 'content_filter' # seems to happen on azure
563
+ ):
564
+ stop_reason = 'content_filter'
565
+
566
+ if stop_reason:
567
+ return ModelOutput.from_content(model=model_name, content=content, stop_reason=stop_reason)
568
+ else:
569
+ raise e
570
+
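A hedged usage sketch: openai.APIStatusError is the SDK's real exception type, while client, model_name and messages are placeholders:

import openai

try:
    completion = client.chat.completions.create(model=model_name, messages=messages)
except openai.APIStatusError as e:
    # Recognized codes are mapped to a ModelOutput with stop_reason
    # 'model_length' or 'content_filter'; anything else is re-raised.
    result = openai_handle_bad_request(model_name, e)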
571
+
572
+ def openai_media_filter(key: Optional[JsonValue], value: JsonValue) -> JsonValue:
573
+ # remove images from raw api call
574
+ if key == 'output' and isinstance(value, dict) and 'image_url' in value:
575
+ value = copy(value)
576
+ value.update(image_url=BASE_64_DATA_REMOVED)
577
+ if key == 'image_url' and isinstance(value, dict) and 'url' in value:
578
+ url = str(value.get('url'))
579
+ if url.startswith('data:'):
580
+ value = copy(value)
581
+ value.update(url=BASE_64_DATA_REMOVED)
582
+ elif key == 'input_audio' and isinstance(value, dict) and 'data' in value:
583
+ value = copy(value)
584
+ value.update(data=BASE_64_DATA_REMOVED)
585
+ return value
586
+
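For illustration, how the filter scrubs a base64 data URL while leaving ordinary URLs alone; standalone calls are shown, whereas in practice the filter is applied while walking the raw request/response JSON:

scrubbed = openai_media_filter('image_url', {'url': 'data:image/png;base64,iVBORw0...', 'detail': 'auto'})
# -> {'url': BASE_64_DATA_REMOVED, 'detail': 'auto'}
kept = openai_media_filter('image_url', {'url': 'https://example.com/cat.png'})
# -> returned unchanged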
587
+
588
+ def _parse_content_with_internal(content: str) -> Tuple[str, Optional[JsonValue]]:
589
+ """
590
+ Extracts and removes a smuggled <internal>...</internal> tag from the content string, if present.
591
+
592
+ Note:
593
+ The OpenAI provider does not natively use `.internal`. However, in bridge
594
+ scenarios—where output from a model that does use `.internal` is routed
595
+ through this code—such a tag may be present and should be handled.
596
+
597
+ Args:
598
+ content: The input string, possibly containing an <internal> tag with
599
+ base64-encoded JSON.
600
+
601
+ Returns:
602
+ tuple[str, JsonValue | None]:
603
+ - The content string with the <internal>...</internal> tag removed (if present), otherwise the original string.
604
+ - The decoded and parsed internal value (if present), otherwise None.
605
+
606
+ Raises:
607
+ json.JSONDecodeError: If the content of the <internal> tag is not valid JSON after decoding.
608
+ UnicodeDecodeError: If the content of the <internal> tag is not valid UTF-8 after base64 decoding.
609
+ """ # noqa: E501
610
+ internal_pattern = r'<internal>(.*?)</internal>'
611
+ internal_match = re.search(internal_pattern, content, re.DOTALL)
612
+
613
+ return ((
614
+ re.sub(internal_pattern, '', content, flags=re.DOTALL).strip(),
615
+ json.loads(base64.b64decode(internal_match.group(1)).decode('utf-8')),
616
+ ) if internal_match else (content, None))
617
+
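A round-trip sketch of the smuggled payload the docstring describes, using only the standard library plus the helper above:

import base64
import json

internal = base64.b64encode(json.dumps({'agent': 'sub'}).encode('utf-8')).decode('ascii')
smuggled = f'<internal>{internal}</internal>The tool ran successfully.'

text, payload = _parse_content_with_internal(smuggled)
# text    == 'The tool ran successfully.'
# payload == {'agent': 'sub'}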
618
+
619
+ def collect_stream_response(response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
620
+ collected_chunks: List[ChatCompletionChunk] = []
621
+ collected_messages = defaultdict(list)
622
+ collected_reasoning = defaultdict(list)
623
+ collected_tool_calls = defaultdict(dict)
624
+
625
+ for chunk in response_stream:
626
+ collected_chunks.append(chunk)
627
+ for choice in chunk.choices:
628
+ # Handle reasoning content
629
+ if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
630
+ collected_reasoning[choice.index].append(choice.delta.reasoning_content)
631
+
632
+ # Handle regular content
633
+ if choice.delta.content is not None:
634
+ collected_messages[choice.index].append(choice.delta.content)
635
+
636
+ # Handle tool calls
637
+ if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
638
+ for tool_call in choice.delta.tool_calls:
639
+ tool_id = tool_call.index
640
+
641
+ # Initialize tool call if not present
642
+ if tool_id not in collected_tool_calls[choice.index]:
643
+ collected_tool_calls[choice.index][tool_id] = {
644
+ 'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
645
+ 'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
646
+ 'function': {
647
+ 'name': '',
648
+ 'arguments': ''
649
+ }
650
+ }
651
+
652
+ # Update tool call with new chunks
653
+ if hasattr(tool_call, 'function'):
654
+ if hasattr(tool_call.function, 'name') and tool_call.function.name:
655
+ collected_tool_calls[choice.index][tool_id]['function']['name'] = tool_call.function.name
656
+
657
+ if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
658
+ collected_tool_calls[choice.index
659
+ ][tool_id]['function']['arguments'] += tool_call.function.arguments
660
+
661
+ # Update ID if it was received later
662
+ if hasattr(tool_call, 'id') and tool_call.id:
663
+ collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id
664
+
665
+ # Get all unique choice indices from all collections
666
+ all_indices = set(collected_messages.keys()) | set(collected_reasoning.keys()) | set(collected_tool_calls.keys())
667
+
668
+ choices = []
669
+ for index in all_indices:
670
+ full_reply_content = ''.join(collected_messages.get(index, []))
671
+ reasoning = ''.join(collected_reasoning.get(index, []))
672
+
673
+ # Process tool_calls for this choice if any exists
674
+ tool_calls_list = None
675
+ if index in collected_tool_calls and collected_tool_calls[index]:
676
+ tool_calls_list = list(collected_tool_calls[index].values())
677
+ # Filter out any tool calls with None id (incomplete tool calls)
678
+ tool_calls_list = [tc for tc in tool_calls_list if tc['id'] is not None]
679
+
680
+ # use the finish_reason from the last chunk that generated this choice
681
+ finish_reason = None
682
+ for chunk in reversed(collected_chunks):
683
+ if chunk.choices and chunk.choices[0].index == index:
684
+ finish_reason = chunk.choices[0].finish_reason
685
+ break
686
+
687
+ message_kwargs = {'role': 'assistant', 'content': full_reply_content}
688
+
689
+ if reasoning:
690
+ message_kwargs['reasoning_content'] = reasoning
691
+
692
+ if tool_calls_list:
693
+ message_kwargs['tool_calls'] = tool_calls_list
694
+
695
+ choice = Choice(
696
+ finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs)
697
+ )
698
+ choices.append(choice)
699
+
700
+ # build the final completion object
701
+ return ChatCompletion(
702
+ id=collected_chunks[0].id,
703
+ choices=choices,
704
+ created=collected_chunks[0].created,
705
+ model=collected_chunks[0].model,
706
+ object='chat.completion',
707
+ usage=collected_chunks[-1].usage # use the usage from the last chunk
708
+ )
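A usage sketch for aggregating a finished stream; client, model_name and messages are placeholders, and stream_options={'include_usage': True} is the standard OpenAI option that makes the final chunk carry usage (which collect_stream_response reads from the last chunk):

chunks = list(
    client.chat.completions.create(
        model=model_name,
        messages=messages,
        stream=True,
        stream_options={'include_usage': True},
    )
)
completion = collect_stream_response(chunks)
# completion.choices[0].message.content holds the joined content deltas; any
# streamed reasoning_content and tool_calls are re-attached to the message.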