evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (302) hide show
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
@@ -0,0 +1,701 @@
1
+ import base64
2
+ import json
3
+ import re
4
+ from collections import defaultdict
5
+ from copy import copy
6
+ from openai import APIStatusError, OpenAIError
7
+ from openai.types.chat import (
8
+ ChatCompletion,
9
+ ChatCompletionAssistantMessageParam,
10
+ ChatCompletionChunk,
11
+ ChatCompletionContentPartImageParam,
12
+ ChatCompletionContentPartInputAudioParam,
13
+ ChatCompletionContentPartParam,
14
+ ChatCompletionContentPartRefusalParam,
15
+ ChatCompletionContentPartTextParam,
16
+ ChatCompletionDeveloperMessageParam,
17
+ ChatCompletionMessage,
18
+ ChatCompletionMessageParam,
19
+ ChatCompletionMessageToolCall,
20
+ ChatCompletionMessageToolCallParam,
21
+ ChatCompletionNamedToolChoiceParam,
22
+ ChatCompletionSystemMessageParam,
23
+ ChatCompletionToolChoiceOptionParam,
24
+ ChatCompletionToolMessageParam,
25
+ ChatCompletionToolParam,
26
+ ChatCompletionUserMessageParam,
27
+ )
28
+ from openai.types.chat.chat_completion import Choice, ChoiceLogprobs
29
+ from openai.types.chat.chat_completion_message_tool_call import Function
30
+ from openai.types.completion_usage import CompletionUsage
31
+ from openai.types.shared_params.function_definition import FunctionDefinition
32
+ from pydantic import JsonValue
33
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
34
+
35
+ from evalscope.api.messages import (
36
+ ChatMessage,
37
+ ChatMessageAssistant,
38
+ ChatMessageSystem,
39
+ ChatMessageTool,
40
+ ChatMessageUser,
41
+ Content,
42
+ ContentAudio,
43
+ ContentImage,
44
+ ContentReasoning,
45
+ ContentText,
46
+ parse_content_with_reasoning,
47
+ )
48
+ from evalscope.api.model import (
49
+ ChatCompletionChoice,
50
+ GenerateConfig,
51
+ Logprobs,
52
+ ModelOutput,
53
+ ModelUsage,
54
+ StopReason,
55
+ as_stop_reason,
56
+ )
57
+ from evalscope.api.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo, parse_tool_call
58
+ from evalscope.utils.url_utils import file_as_data_uri, is_http_url
59
+
60
+ BASE_64_DATA_REMOVED = '<base64-data-removed>'
61
+
62
+
63
class OpenAIResponseError(OpenAIError):
    """Error raised for OpenAI API responses that signal a failure.

    Carries the provider-supplied error ``code`` together with the
    human-readable ``message`` so callers can branch on the code while
    still logging the detail text.
    """

    def __init__(self, code: str, message: str) -> None:
        self.code = code
        self.message = message

    def __str__(self) -> str:
        # Same "code: message" rendering as before, via str.format.
        return '{}: {}'.format(self.code, self.message)
71
+
72
+
73
def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
    """Convert an internal ToolCall into an OpenAI response-side tool call.

    Arguments are JSON-serialized since the OpenAI type carries them as a string.
    """
    serialized_args = json.dumps(tool_call.function.arguments)
    return ChatCompletionMessageToolCall(
        id=tool_call.id,
        type='function',
        function=Function(name=tool_call.function.name, arguments=serialized_args),
    )
79
+
80
+
81
def openai_chat_tool_call_param(tool_call: ToolCall) -> ChatCompletionMessageToolCallParam:
    """Convert an internal ToolCall into the request-side (TypedDict) param form."""
    function_payload = dict(
        name=tool_call.function.name,
        arguments=json.dumps(tool_call.function.arguments),
    )
    return ChatCompletionMessageToolCallParam(
        type='function',
        id=tool_call.id,
        function=function_payload,
    )
87
+
88
+
89
def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartParam:
    """Convert one internal Content part into an OpenAI content-part param.

    Text passes through untouched; images are sent by URL (local files are
    inlined as data URIs); audio is sent as a raw base64 payload plus its
    format. Any other content type (e.g. video) is rejected.
    """
    kind = content.type

    if kind == 'text':
        return ChatCompletionContentPartTextParam(type='text', text=content.text)

    if kind == 'image':
        # Remote URLs and data URLs go through as-is; anything else is a
        # local file that must be encoded into a data URI for the API.
        url = content.image
        if not is_http_url(url):
            url = file_as_data_uri(url)
        return ChatCompletionContentPartImageParam(
            type='image_url',
            image_url=dict(url=url, detail=content.detail),
        )

    if kind == 'audio':
        # The API wants only the base64 payload, so strip the data-URI prefix.
        data_uri = file_as_data_uri(content.audio)
        payload = data_uri.split('base64,')[1]
        return ChatCompletionContentPartInputAudioParam(
            type='input_audio', input_audio=dict(data=payload, format=content.format)
        )

    raise RuntimeError('Video content is not currently supported by Open AI chat models.')
115
+
116
+
117
def openai_chat_message(
    message: ChatMessage, system_role: Literal['user', 'system', 'developer'] = 'system'
) -> ChatCompletionMessageParam:
    """Convert one internal chat message into an OpenAI message param.

    ``system_role`` controls how system messages are presented, since some
    models want them as 'user' or 'developer' messages instead of 'system'.
    Raises ValueError for an unrecognized message role.
    """
    if message.role == 'system':
        # Re-route the system prompt to whichever role the target model expects.
        if system_role == 'user':
            return ChatCompletionUserMessageParam(role='user', content=message.text)
        elif system_role == 'system':
            return ChatCompletionSystemMessageParam(role=message.role, content=message.text)
        elif system_role == 'developer':
            return ChatCompletionDeveloperMessageParam(role='developer', content=message.text)
    elif message.role == 'user':
        # Plain strings pass through; structured content is converted part by part.
        if isinstance(message.content, str):
            user_content: Union[str, List[ChatCompletionContentPartParam]] = message.content
        else:
            user_content = [openai_chat_completion_part(part) for part in message.content]
        return ChatCompletionUserMessageParam(role=message.role, content=user_content)
    elif message.role == 'assistant':
        asst = ChatCompletionAssistantMessageParam(
            role=message.role,
            content=openai_assistant_content(message),
        )
        # Only attach the tool_calls key when there actually are tool calls.
        if message.tool_calls:
            asst['tool_calls'] = [openai_chat_tool_call_param(call) for call in message.tool_calls]
        return asst
    elif message.role == 'tool':
        # Tool errors are surfaced to the model as an "Error: ..." text payload.
        if message.error:
            tool_text = f'Error: {message.error.message}'
        else:
            tool_text = message.text
        return ChatCompletionToolMessageParam(
            role=message.role,
            content=tool_text,
            tool_call_id=str(message.tool_call_id),
        )
    else:
        raise ValueError(f'Unexpected message role {message.role}')
152
+
153
+
154
def openai_chat_messages(
    messages: List[ChatMessage],
    system_role: Literal['user', 'system', 'developer'] = 'system',
) -> List[ChatCompletionMessageParam]:
    """Convert a list of internal chat messages to OpenAI message params."""
    converted: List[ChatCompletionMessageParam] = []
    for msg in messages:
        converted.append(openai_chat_message(msg, system_role))
    return converted
159
+
160
+
161
def openai_completion_params(model: str, config: GenerateConfig, tools: bool) -> Dict[str, Any]:
    """Translate a GenerateConfig into OpenAI chat-completion request kwargs.

    Only options explicitly set (non-None) on ``config`` are emitted, so the
    provider's own defaults apply otherwise. ``tools`` gates options that are
    only valid when tool definitions are sent with the request.
    """
    params: Dict[str, Any] = {'model': model}

    # Streaming gets a companion option so token usage arrives in the stream.
    if config.stream is not None:
        params['stream'] = config.stream
        if config.stream:
            params['stream_options'] = {'include_usage': True}

    # One-to-one options: copy each over only when explicitly set.
    simple_options = (
        ('timeout', config.timeout),
        ('max_tokens', config.max_tokens),
        ('frequency_penalty', config.frequency_penalty),
        ('stop', config.stop_seqs),
        ('presence_penalty', config.presence_penalty),
        ('logit_bias', config.logit_bias),
        ('seed', config.seed),
        ('temperature', config.temperature),
        ('top_p', config.top_p),
        ('n', config.n),
        ('logprobs', config.logprobs),
        ('top_logprobs', config.top_logprobs),
    )
    for key, value in simple_options:
        if value is not None:
            params[key] = value

    # parallel_tool_calls is only meaningful when tools are supplied.
    if tools and config.parallel_tool_calls is not None:
        params['parallel_tool_calls'] = config.parallel_tool_calls

    if config.reasoning_effort is not None:
        params['reasoning_effort'] = config.reasoning_effort

    # Structured output: wrap the schema in OpenAI's json_schema envelope.
    if config.response_schema is not None:
        params['response_format'] = dict(
            type='json_schema',
            json_schema=dict(
                name=config.response_schema.name,
                schema=config.response_schema.json_schema.model_dump(exclude_none=True),
                description=config.response_schema.description,
                strict=config.response_schema.strict,
            ),
        )

    # extra_body is pass-through for provider-specific extensions.
    if config.extra_body:
        params['extra_body'] = config.extra_body

    return params
210
+
211
+
212
def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
    """Flatten an assistant message into the plain-text string OpenAI expects.

    Reasoning segments are smuggled inside ``<think>`` tags (with optional
    signature/redacted attributes) and any ``.internal`` payload is appended
    as a base64-encoded ``<internal>`` tag, so both survive multi-turn round
    trips through the plain-text choices API.
    """
    if isinstance(message.content, str):
        text = message.content
    else:
        pieces: List[str] = []
        for part in message.content:
            if part.type == 'reasoning' and include_reasoning:
                attribs = ''
                if part.signature is not None:
                    attribs += f' signature="{part.signature}"'
                if part.redacted:
                    attribs += ' redacted="true"'
                pieces.append(f'\n<think{attribs}>\n{part.reasoning}\n</think>\n')
            elif part.type == 'text':
                pieces.append(f'\n{part.text}')
        text = ''.join(pieces)

    # Smuggle .internal through the plain-text channel as base64 JSON.
    if message.internal:
        encoded = base64.b64encode(json.dumps(message.internal).encode('utf-8')).decode('utf-8')
        text = f'{text}\n<internal>{encoded}</internal>\n'
    return text
240
+
241
+
242
def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
    """Convert internal completion choices into OpenAI ``Choice`` objects."""
    converted: List[Choice] = []

    for idx, src in enumerate(choices):
        # Flatten content (optionally smuggling reasoning as <think> tags).
        text = openai_assistant_content(src.message, include_reasoning=include_reasoning)

        # Tool calls translate one-to-one; None when the message has none.
        if src.message.tool_calls:
            calls = [openai_chat_tool_call(tc) for tc in src.message.tool_calls]
        else:
            calls = None

        logprobs = ChoiceLogprobs(**src.logprobs.model_dump()) if src.logprobs is not None else None
        converted.append(
            Choice(
                finish_reason=openai_finish_reason(src.stop_reason),
                index=idx,
                message=ChatCompletionMessage(role='assistant', content=text, tool_calls=calls),
                logprobs=logprobs,
            )
        )

    return converted
265
+
266
+
267
def openai_completion_usage(usage: ModelUsage) -> CompletionUsage:
    """Convert internal ModelUsage counters into OpenAI's CompletionUsage."""
    return CompletionUsage(
        prompt_tokens=usage.input_tokens,
        completion_tokens=usage.output_tokens,
        total_tokens=usage.total_tokens,
    )
273
+
274
+
275
def openai_finish_reason(
    stop_reason: StopReason
) -> Literal['stop', 'length', 'tool_calls', 'content_filter', 'function_call']:
    """Map an internal StopReason onto OpenAI's finish_reason vocabulary.

    Reasons OpenAI already knows pass through; 'model_length' becomes
    'length'; anything else falls back to 'stop'.
    """
    if stop_reason == 'model_length':
        return 'length'
    if stop_reason in ('stop', 'tool_calls', 'content_filter'):
        return stop_reason
    return 'stop'
284
+
285
+
286
def openai_chat_tool_param(tool: ToolInfo) -> ChatCompletionToolParam:
    """Convert an internal ToolInfo into an OpenAI tool-definition param."""
    schema = tool.parameters.model_dump(exclude_none=True)
    definition = FunctionDefinition(
        name=tool.name,
        description=tool.description,
        parameters=schema,
    )
    return ChatCompletionToolParam(type='function', function=definition)
293
+
294
+
295
def openai_chat_tools(tools: List[ToolInfo]) -> List[ChatCompletionToolParam]:
    """Convert each ToolInfo into its OpenAI request-param form."""
    return [openai_chat_tool_param(t) for t in tools]
297
+
298
+
299
def openai_chat_tool_choice(tool_choice: ToolChoice) -> ChatCompletionToolChoiceOptionParam:
    """Map an internal ToolChoice onto OpenAI's tool_choice request option."""
    # A specific function target uses the named-tool-choice param form.
    if isinstance(tool_choice, ToolFunction):
        return ChatCompletionNamedToolChoiceParam(type='function', function=dict(name=tool_choice.name))
    # openai supports 'any' via the 'required' keyword
    if tool_choice == 'any':
        return 'required'
    # 'auto' / 'none' pass through unchanged.
    return tool_choice
307
+
308
+
309
def chat_tool_calls_from_openai(message: ChatCompletionMessage, tools: List[ToolInfo]) -> Optional[List[ToolCall]]:
    """Parse the tool calls on an OpenAI message into internal ToolCall objects.

    Returns None when the message carries no tool calls.
    """
    calls = message.tool_calls
    if not calls:
        return None
    return [parse_tool_call(c.id, c.function.name, c.function.arguments, tools) for c in calls]
316
+
317
+
318
def chat_messages_from_openai(
    model: str,
    messages: List[ChatCompletionMessageParam],
) -> List[ChatMessage]:
    """Convert OpenAI-format chat message params into Inspect ChatMessage objects.

    Handles the four OpenAI roles (system/developer, user, assistant, tool),
    including smuggled <internal>/<think> payloads used in agent-bridge
    scenarios and the non-standard reasoning fields emitted by
    OpenAI-compatible providers.

    Args:
        model: Model name to record on converted assistant messages.
        messages: The OpenAI message params to convert, in order.

    Returns:
        The converted messages, preserving the original order.

    Raises:
        ValueError: If a message has a role other than
            system/developer/user/assistant/tool.
    """
    # track tool names by id (tool-result messages only carry the call id,
    # so we remember the function name from the originating assistant message)
    tool_names: Dict[str, str] = {}

    chat_messages: List[ChatMessage] = []

    for message in messages:
        content: Union[str, List[Content]] = []
        if message['role'] == 'system' or message['role'] == 'developer':
            # system/developer: plain string or a list of content parts
            sys_content = message['content']
            if isinstance(sys_content, str):
                chat_messages.append(ChatMessageSystem(content=sys_content))
            else:
                content = []
                for sc in sys_content:
                    content.extend(content_from_openai(sc))
                chat_messages.append(ChatMessageSystem(content=content))
        elif message['role'] == 'user':
            # user: plain string or a list of content parts
            user_content = message['content']
            if isinstance(user_content, str):
                chat_messages.append(ChatMessageUser(content=user_content))
            else:
                content = []
                for uc in user_content:
                    content.extend(content_from_openai(uc))
                chat_messages.append(ChatMessageUser(content=content))
        elif message['role'] == 'assistant':
            # resolve content
            refusal: Optional[Literal[True]] = None
            internal: Optional[JsonValue] = None
            asst_content = message.get('content', None)
            if isinstance(asst_content, str):
                # Even though the choices API doesn't take advantage of .internal,
                # we could be transforming from OpenAI choices to Inspect for agent
                # bridge scenarios where a different model (that does use .internal)
                # is the actual model being used.
                asst_content, internal = _parse_content_with_internal(asst_content)
                asst_content, smuggled_reasoning = parse_content_with_reasoning(asst_content)
                if smuggled_reasoning:
                    content = [
                        smuggled_reasoning,
                        ContentText(text=asst_content),
                    ]
                else:
                    content = asst_content
            elif asst_content is None:
                # no content at all: fall back to the refusal text (if any)
                content = message.get('refusal', None) or ''
                if content:
                    refusal = True
            else:
                # list of content parts
                content = []
                for ac in asst_content:
                    content.extend(content_from_openai(ac, parse_reasoning=True))

            # resolve reasoning (OpenAI doesn't support this however OpenAI-compatible
            # interfaces e.g. DeepSeek do include this field so we pluck it out)
            reasoning = message.get('reasoning_content', None) or message.get('reasoning', None)
            if reasoning is not None:
                # normalize content to an array
                if isinstance(content, str):
                    content = [ContentText(text=content, refusal=refusal)]

                # insert reasoning
                content.insert(0, ContentReasoning(reasoning=str(reasoning)))

            # return message (recording each tool call's function name by id
            # so the matching tool-result message can reference it)
            if 'tool_calls' in message:
                tool_calls: List[ToolCall] = []
                for call in message['tool_calls']:
                    tool_calls.append(tool_call_from_openai(call))
                    tool_names[call['id']] = call['function']['name']

            else:
                tool_calls = []

            chat_messages.append(
                ChatMessageAssistant(
                    content=content,
                    tool_calls=tool_calls or None,
                    model=model,
                    source='generate',
                    internal=internal,
                )
            )
        elif message['role'] == 'tool':
            tool_content = message.get('content', None) or ''
            if isinstance(tool_content, str):
                # If tool_content is a simple str, it could be the result of some
                # sub-agent tool call that has <think> or <internal> smuggled inside
                # of it to support agent bridge scenarios. We have to strip that
                # data. To be clear, if it's <think>, we'll strip the <think> tag,
                # but the reasoning summary itself will remain in the content.
                content, _ = _parse_content_with_internal(tool_content)
                content, _ = parse_content_with_reasoning(content)
            else:
                content = []
                for tc in tool_content:
                    content.extend(content_from_openai(tc))
            chat_messages.append(
                ChatMessageTool(
                    content=content,
                    tool_call_id=message['tool_call_id'],
                    # function name recorded from the originating assistant message
                    function=tool_names.get(message['tool_call_id'], ''),
                )
            )
        else:
            raise ValueError(f'Unexpected message param type: {type(message)}')

    return chat_messages
430
+
431
+
432
def tool_call_from_openai(tool_call: ChatCompletionMessageToolCallParam) -> ToolCall:
    """Convert an OpenAI tool-call param dict into an Inspect ToolCall."""
    function = tool_call['function']
    return parse_tool_call(tool_call['id'], function['name'], function['arguments'])
438
+
439
+
440
def content_from_openai(
    content: Union[ChatCompletionContentPartParam, ChatCompletionContentPartRefusalParam],
    parse_reasoning: bool = False,
) -> List[Content]:
    """Translate a single OpenAI content part into a list of Inspect Content items.

    Args:
        content: The OpenAI content part (text, reasoning, image, audio, refusal).
        parse_reasoning: When True, extract smuggled <think> reasoning from
            text parts into a separate ContentReasoning item.

    Returns:
        One or two Content items representing the part.

    Raises:
        ValueError: If the part's type is not recognized.
    """
    # Some providers omit the type tag and use "object-with-a-single-field" encoding
    if 'type' not in content and len(content) == 1:
        content['type'] = list(content.keys())[0]  # type: ignore[arg-type]

    part_type = content['type']
    if part_type == 'text':
        text = content['text']
        if parse_reasoning:
            content_text, content_reasoning = parse_content_with_reasoning(text)
            if content_reasoning:
                return [
                    content_reasoning,
                    ContentText(text=content_text),
                ]
        return [ContentText(text=text)]
    if part_type == 'reasoning':  # type: ignore[comparison-overlap]
        return [ContentReasoning(reasoning=content['reasoning'])]
    if part_type == 'image_url':
        image = content['image_url']
        return [ContentImage(image=image['url'], detail=image['detail'])]
    if part_type == 'input_audio':
        audio = content['input_audio']
        return [ContentAudio(audio=audio['data'], format=audio['format'])]
    if part_type == 'refusal':
        return [ContentText(text=content['refusal'], refusal=True)]
    raise ValueError(f"Unexpected content type '{part_type}' in message.")
474
+
475
+
476
def chat_message_assistant_from_openai(
    model: str, message: ChatCompletionMessage, tools: List[ToolInfo]
) -> ChatMessageAssistant:
    """Build a ChatMessageAssistant from an OpenAI completion message."""
    refusal = getattr(message, 'refusal', None)
    # OpenAI-compatible providers (e.g. DeepSeek) attach reasoning via these fields
    reasoning = getattr(message, 'reasoning_content', None) or getattr(message, 'reasoning', None)

    text = refusal or message.content or ''
    content: Union[str, List[Content]]
    if reasoning is not None:
        # reasoning present: content becomes [reasoning, text]
        content = [
            ContentReasoning(reasoning=str(reasoning)),
            ContentText(text=text, refusal=True if refusal else None),
        ]
    elif refusal is not None:
        # refusal without reasoning: flag the text as a refusal
        content = [ContentText(text=text, refusal=True)]
    else:
        content = text

    return ChatMessageAssistant(
        content=content,
        model=model,
        source='generate',
        tool_calls=chat_tool_calls_from_openai(message, tools),
    )
499
+
500
+
501
def model_output_from_openai(
    completion: ChatCompletion,
    choices: list[ChatCompletionChoice],
) -> ModelOutput:
    """Assemble a ModelOutput from an OpenAI completion and pre-built choices."""
    usage = None
    if completion.usage:
        prompt_details = completion.usage.prompt_tokens_details
        completion_details = completion.usage.completion_tokens_details
        usage = ModelUsage(
            input_tokens=completion.usage.prompt_tokens,
            output_tokens=completion.usage.completion_tokens,
            # openai only have cache read stats/pricing.
            input_tokens_cache_read=(prompt_details.cached_tokens if prompt_details is not None else None),
            reasoning_tokens=(completion_details.reasoning_tokens if completion_details is not None else None),
            total_tokens=completion.usage.total_tokens,
        )
    return ModelOutput(
        model=completion.model,
        choices=choices,
        usage=usage,
    )
524
+
525
+
526
def chat_choices_from_openai(response: ChatCompletion, tools: List[ToolInfo]) -> List[ChatCompletionChoice]:
    """Convert the choices on an OpenAI response, ordered by choice index."""
    result: List[ChatCompletionChoice] = []
    for choice in sorted(response.choices, key=lambda c: c.index):
        # logprobs are only carried over when content logprobs are present
        logprobs = None
        if choice.logprobs and choice.logprobs.content is not None:
            logprobs = Logprobs(**choice.logprobs.model_dump())
        result.append(
            ChatCompletionChoice(
                message=chat_message_assistant_from_openai(response.model, choice.message, tools),
                stop_reason=as_stop_reason(choice.finish_reason),
                logprobs=logprobs,
            )
        )
    return result
539
+
540
+
541
def openai_handle_bad_request(model_name: str, e: APIStatusError) -> Union[ModelOutput, Exception]:
    """Map an OpenAI bad-request error to a ModelOutput when it has a known stop reason.

    Context-length and content-filter style errors are converted into a
    ModelOutput carrying the error message as content; any other error is
    re-raised unchanged.

    Args:
        model_name: Model name to record on the synthesized output.
        e: The API error returned by the OpenAI client.

    Returns:
        A ModelOutput with the appropriate stop reason for recognized errors.

    Raises:
        APIStatusError: The original error, when its code is not recognized.
    """
    # extract message (prefer the structured body when present)
    if isinstance(e.body, dict) and 'message' in e.body:
        content = str(e.body.get('message'))
    else:
        content = e.message

    # narrow stop_reason
    stop_reason: Optional[StopReason] = None
    if e.code == 'context_length_exceeded':
        stop_reason = 'model_length'
    elif e.code in (
        'invalid_prompt',  # seems to happen for o1/o3
        'content_policy_violation',  # seems to happen for vision
        'content_filter',  # seems to happen on azure
    ):
        stop_reason = 'content_filter'

    if stop_reason:
        return ModelOutput.from_content(model=model_name, content=content, stop_reason=stop_reason)
    raise e
563
+
564
+
565
def openai_media_filter(key: Optional[JsonValue], value: JsonValue) -> JsonValue:
    """Redact base64 media payloads from a raw OpenAI api-call record."""
    if isinstance(value, dict):
        # remove images from raw api call
        if key == 'output' and 'image_url' in value:
            value = copy(value)
            value.update(image_url=BASE_64_DATA_REMOVED)
        if key == 'image_url' and 'url' in value:
            # only data: URLs embed base64 payloads; http(s) URLs are kept
            if str(value.get('url')).startswith('data:'):
                value = copy(value)
                value.update(url=BASE_64_DATA_REMOVED)
        elif key == 'input_audio' and 'data' in value:
            value = copy(value)
            value.update(data=BASE_64_DATA_REMOVED)
    return value
579
+
580
+
581
def _parse_content_with_internal(content: str, ) -> 'Tuple[str, Optional[JsonValue]]':
    """
    Extracts and removes a smuggled <internal>...</internal> tag from the content string, if present.

    Note:
        This OpenAI model does not natively use `.internal`. However, in bridge
        scenarios—where output from a model that does use `.internal` is routed
        through this code—such a tag may be present and should be handled.

    Args:
        content: The input string, possibly containing an <internal> tag with
            base64-encoded JSON.

    Returns:
        tuple[str, JsonValue | None]:
            - The content string with the <internal>...</internal> tag removed (if present), otherwise the original string.
            - The decoded and parsed internal value (if present), otherwise None.

    Raises:
        json.JSONDecodeError: If the content of the <internal> tag is not valid JSON after decoding.
        UnicodeDecodeError: If the content of the <internal> tag is not valid UTF-8 after base64 decoding.
    """  # noqa: E501
    internal_pattern = r'<internal>(.*?)</internal>'
    # use the single pattern definition for both search and removal
    internal_match = re.search(internal_pattern, content, re.DOTALL)
    if internal_match is None:
        return content, None

    # strip the tag from the visible content and decode its base64 JSON payload
    stripped = re.sub(internal_pattern, '', content, flags=re.DOTALL).strip()
    internal = json.loads(base64.b64decode(internal_match.group(1)).decode('utf-8'))
    return stripped, internal
610
+
611
+
612
def collect_stream_response(response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
    """Fold a list of streamed chat-completion chunks into a single ChatCompletion.

    Accumulates per-choice text, reasoning content and tool-call fragments
    across chunks, then rebuilds a non-streaming completion object. Usage is
    taken from the final chunk; id/created/model from the first.

    Args:
        response_stream: The chunks received from a streaming request, in order.

    Returns:
        A ChatCompletion equivalent to a non-streaming response.

    Raises:
        ValueError: If the stream contains no chunks.
    """
    # id/created/model below come from the first chunk, so an empty stream
    # cannot be assembled
    if not response_stream:
        raise ValueError('Cannot collect a response from an empty stream')

    collected_chunks: List[ChatCompletionChunk] = []
    collected_messages = defaultdict(list)  # choice index -> text fragments
    collected_reasoning = defaultdict(list)  # choice index -> reasoning fragments
    collected_tool_calls = defaultdict(dict)  # choice index -> {tool index -> partial call}

    for chunk in response_stream:
        collected_chunks.append(chunk)
        for choice in chunk.choices:
            # Handle reasoning content (emitted by OpenAI-compatible providers)
            if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
                collected_reasoning[choice.index].append(choice.delta.reasoning_content)

            # Handle regular content
            if choice.delta.content is not None:
                collected_messages[choice.index].append(choice.delta.content)

            # Handle tool calls (streamed as fragments keyed by tool index)
            if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
                for tool_call in choice.delta.tool_calls:
                    tool_id = tool_call.index

                    # Initialize tool call if not present
                    if tool_id not in collected_tool_calls[choice.index]:
                        collected_tool_calls[choice.index][tool_id] = {
                            'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
                            'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
                            'function': {
                                'name': '',
                                'arguments': ''
                            }
                        }

                    # Update tool call with new chunks (name replaces, arguments append)
                    if hasattr(tool_call, 'function'):
                        if hasattr(tool_call.function, 'name') and tool_call.function.name:
                            collected_tool_calls[choice.index][tool_id]['function']['name'] = tool_call.function.name

                        if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
                            collected_tool_calls[choice.index
                                                 ][tool_id]['function']['arguments'] += tool_call.function.arguments

                    # Update ID if it was received later
                    if hasattr(tool_call, 'id') and tool_call.id:
                        collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id

    # Get all unique choice indices from all collections (sorted so the
    # rebuilt choices come out in a deterministic order)
    all_indices = sorted(set(collected_messages) | set(collected_reasoning) | set(collected_tool_calls))

    choices = []
    for index in all_indices:
        full_reply_content = ''.join(collected_messages.get(index, []))
        reasoning = ''.join(collected_reasoning.get(index, []))

        # Process tool_calls for this choice if any exist, filtering out any
        # with a None id (incomplete tool calls)
        tool_calls_list = None
        if collected_tool_calls.get(index):
            tool_calls_list = [tc for tc in collected_tool_calls[index].values() if tc['id'] is not None]

        # use the finish_reason from the last chunk that carried this choice
        # (scan every choice in the chunk, not just the first, so streams with
        # multiple choices per chunk resolve the correct finish reason)
        finish_reason = None
        for chunk in reversed(collected_chunks):
            matching = next((c for c in chunk.choices if c.index == index), None)
            if matching is not None:
                finish_reason = matching.finish_reason
                break

        message_kwargs = {'role': 'assistant', 'content': full_reply_content}

        if reasoning:
            message_kwargs['reasoning_content'] = reasoning

        if tool_calls_list:
            message_kwargs['tool_calls'] = tool_calls_list

        choice = Choice(
            finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs)
        )
        choices.append(choice)

    # build the final completion object
    return ChatCompletion(
        id=collected_chunks[0].id,
        choices=choices,
        created=collected_chunks[0].created,
        model=collected_chunks[0].model,
        object='chat.completion',
        usage=collected_chunks[-1].usage  # use the usage from the last chunk
    )