sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -13,16 +13,15 @@
13
13
  # ==============================================================================
14
14
  """A controller that dispatches requests to multiple data parallel workers."""
15
15
 
16
+ import faulthandler
16
17
  import logging
17
18
  import multiprocessing as mp
18
19
  import signal
19
- import struct
20
- import sys
21
20
  import threading
22
21
  import time
22
+ from collections import deque
23
23
  from enum import Enum, auto
24
- from multiprocessing import shared_memory
25
- from typing import Dict, List
24
+ from typing import List
26
25
 
27
26
  import psutil
28
27
  import setproctitle
@@ -33,14 +32,19 @@ from sglang.srt.managers.io_struct import (
33
32
  BlockReqInput,
34
33
  TokenizedEmbeddingReqInput,
35
34
  TokenizedGenerateReqInput,
35
+ WatchLoadUpdateReq,
36
36
  )
37
37
  from sglang.srt.managers.schedule_batch import Req
38
38
  from sglang.srt.managers.scheduler import run_scheduler_process
39
- from sglang.srt.managers.utils import DPBalanceMeta
40
39
  from sglang.srt.server_args import PortArgs, ServerArgs
41
40
  from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
42
- from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
43
- from sglang.utils import get_exception_traceback
41
+ from sglang.srt.utils import (
42
+ bind_port,
43
+ configure_logger,
44
+ get_zmq_socket,
45
+ kill_itself_when_parent_died,
46
+ )
47
+ from sglang.utils import TypeBasedDispatcher, get_exception_traceback
44
48
 
45
49
  logger = logging.getLogger(__name__)
46
50
 
@@ -61,18 +65,48 @@ class LoadBalanceMethod(Enum):
61
65
  raise ValueError(f"Invalid load balance method: {method}") from exc
62
66
 
63
67
 
68
+ class DPBudget:
69
+ def __init__(self):
70
+ # TODO: support minimum tokens method
71
+ self.budget_queue = deque()
72
+
73
+ def update_budget(self, load_update: WatchLoadUpdateReq):
74
+ """Update the budget queue.
75
+ Use num_reqs instead of num_waiting_reqs to balance decode running batch.
76
+ """
77
+ loads = load_update.loads
78
+ self.budget_queue.clear()
79
+
80
+ num_reqs = [load.num_reqs for load in loads]
81
+ if not num_reqs:
82
+ return
83
+
84
+ max_num_reqs = max(num_reqs)
85
+ if all(x == max_num_reqs for x in num_reqs):
86
+ return
87
+
88
+ while any(x != num_reqs[0] for x in num_reqs):
89
+ min_load = min(num_reqs)
90
+ min_indices = [i for i, x in enumerate(num_reqs) if x == min_load]
91
+ second_min_load = min(x for x in num_reqs if x > min_load)
92
+ self.budget_queue.extend(
93
+ [loads[i].dp_rank for i in min_indices] * (second_min_load - min_load)
94
+ )
95
+ for idx in min_indices:
96
+ num_reqs[idx] = second_min_load
97
+
98
+ def dispatch(self):
99
+ if self.budget_queue:
100
+ return self.budget_queue.popleft()
101
+ return None
102
+
103
+
64
104
  class DataParallelController:
65
105
  """A controller that dispatches requests to multiple data parallel workers."""
66
106
 
67
- def __init__(
68
- self,
69
- server_args: ServerArgs,
70
- port_args: PortArgs,
71
- dp_balance_meta: DPBalanceMeta,
72
- ) -> None:
107
+ def __init__(self, server_args: ServerArgs, port_args: PortArgs) -> None:
73
108
  # for dp balance
74
109
  self.global_balance_id = 0
75
- self.balance_meta = dp_balance_meta
76
110
 
77
111
  # Parse args
78
112
  self.max_total_num_tokens = None
@@ -98,9 +132,12 @@ class DataParallelController:
98
132
  }
99
133
  self.dispatching = dispatch_lookup[self.load_balance_method]
100
134
 
135
+ # Load balance budget
136
+ self.dp_budget = DPBudget()
137
+
101
138
  # Launch data parallel workers
102
139
  self.scheduler_procs = []
103
- self.workers = [None] * server_args.dp_size
140
+ self.workers: List[zmq.Socket] = [None] * server_args.dp_size
104
141
 
105
142
  if server_args.enable_dp_attention:
106
143
  dp_port_args = self.launch_dp_attention_schedulers(server_args, port_args)
@@ -121,6 +158,31 @@ class DataParallelController:
121
158
 
122
159
  self.max_req_input_len = None
123
160
 
161
+ self.init_dispatcher()
162
+
163
+ def send_to_all_workers(self, obj):
164
+ for worker in self.workers:
165
+ worker.send_pyobj(obj)
166
+
167
+ def send_control_message(self, obj):
168
+ # Send control messages to first worker of tp group
169
+ for worker in self.workers[:: self.control_message_step]:
170
+ worker.send_pyobj(obj)
171
+
172
+ def handle_load_update_req(self, obj):
173
+ self.dp_budget.update_budget(obj)
174
+
175
+ def init_dispatcher(self):
176
+ self._request_dispatcher = TypeBasedDispatcher(
177
+ [
178
+ (TokenizedGenerateReqInput, self.dispatching),
179
+ (TokenizedEmbeddingReqInput, self.dispatching),
180
+ (BlockReqInput, self.send_to_all_workers),
181
+ (WatchLoadUpdateReq, self.handle_load_update_req),
182
+ ]
183
+ )
184
+ self._request_dispatcher.add_fallback_fn(self.send_control_message)
185
+
124
186
  def launch_dp_schedulers(self, server_args, port_args):
125
187
  base_gpu_id = 0
126
188
 
@@ -147,7 +209,9 @@ class DataParallelController:
147
209
  args=(server_args, tmp_port_args, base_gpu_id, dp_rank, ready_event),
148
210
  )
149
211
  threads.append(thread)
150
- base_gpu_id += server_args.tp_size * server_args.gpu_id_step
212
+ base_gpu_id += (
213
+ server_args.tp_size * server_args.pp_size * server_args.gpu_id_step
214
+ )
151
215
 
152
216
  # Free all sockets before starting the threads to launch TP workers
153
217
  for sock in sockets:
@@ -250,7 +314,6 @@ class DataParallelController:
250
314
  pp_rank,
251
315
  dp_rank,
252
316
  writer,
253
- self.balance_meta,
254
317
  ),
255
318
  )
256
319
  with memory_saver_adapter.configure_subprocess():
@@ -266,52 +329,43 @@ class DataParallelController:
266
329
  self.max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
267
330
  self.max_req_input_len = scheduler_info[0]["max_req_input_len"]
268
331
 
332
+ def maybe_external_dp_rank_routing(self, req: Req):
333
+ if req.data_parallel_rank is not None:
334
+ logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
335
+ self.workers[req.data_parallel_rank].send_pyobj(req)
336
+ return True
337
+ return False
338
+
269
339
  def round_robin_scheduler(self, req: Req):
340
+ if self.maybe_external_dp_rank_routing(req):
341
+ return
342
+
270
343
  if self.server_args.disaggregation_mode == "null":
271
- if req.data_parallel_rank is not None:
272
- logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
273
- self.workers[req.data_parallel_rank].send_pyobj(req)
274
- else:
275
- self.workers[self.round_robin_counter].send_pyobj(req)
276
- self.round_robin_counter = (self.round_robin_counter + 1) % len(
277
- self.workers
278
- )
344
+ self.workers[self.round_robin_counter].send_pyobj(req)
345
+ self.round_robin_counter = (self.round_robin_counter + 1) % len(
346
+ self.workers
347
+ )
279
348
  else:
280
- if req.data_parallel_rank is not None:
281
- logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
282
- self.workers[req.data_parallel_rank].send_pyobj(req)
283
- else:
284
- self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
285
-
286
- def shortest_queue_scheduler(self, input_requests):
287
- raise NotImplementedError()
349
+ self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
350
+
351
+ def shortest_queue_scheduler(self, req):
352
+ if self.maybe_external_dp_rank_routing(req):
353
+ return
354
+ target_worker = self.dp_budget.dispatch()
355
+ if target_worker is None:
356
+ self.round_robin_scheduler(req)
357
+ else:
358
+ self.workers[target_worker].send_pyobj(req)
288
359
 
289
360
  def minimum_tokens_scheduler(self, req):
290
- # This variable corresponds to the balance_id in TokenizedGenerateReqInput.
291
- # We use it to to control the number of onfly tokens (requests dispatched to workers but not yet received).
292
- def get_next_global_balance_id() -> int:
293
- INT32_MAX = 2147483647
294
- current_id = self.global_balance_id
295
- self.global_balance_id = (self.global_balance_id + 1) % INT32_MAX
296
- return current_id
297
-
298
- req.dp_balance_id = get_next_global_balance_id()
299
- with self.balance_meta.mutex:
300
- # 1. local_tokens represents the tokens currently inferring on the worker,
301
- # while onfly refers to the requests dispatched by the dispatcher but not yet received by the scheduler.
302
- onfly_info = self.balance_meta.get_shared_onfly()
303
- local_tokens = self.balance_meta.get_shared_local_tokens()
304
- total_tokens = [
305
- local_token + sum(onfly_dict.values())
306
- for local_token, onfly_dict in zip(local_tokens, onfly_info)
307
- ]
308
- target_worker = total_tokens.index(min(total_tokens))
309
- onfly_info[target_worker][req.dp_balance_id] = len(req.input_ids)
310
- # 2. write the new onfly info to the shm
311
- self.balance_meta.set_shared_onfly_info(onfly_info)
361
+ if self.maybe_external_dp_rank_routing(req):
362
+ return
312
363
 
313
- # logger.info(f"dp workers {local_tokens=}, {onfly_info=}, {target_worker=}")
314
- self.workers[target_worker].send_pyobj(req)
364
+ logger.warning(
365
+ "The 'minimum_tokens' load balancing method is deprecated for now and will introduced later."
366
+ "Fall back to 'round_robin_scheduler'"
367
+ )
368
+ self.round_robin_scheduler(req)
315
369
 
316
370
  def event_loop(self):
317
371
  while True:
@@ -320,22 +374,7 @@ class DataParallelController:
320
374
  recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
321
375
  except zmq.ZMQError:
322
376
  break
323
-
324
- if isinstance(
325
- recv_req,
326
- (
327
- TokenizedGenerateReqInput,
328
- TokenizedEmbeddingReqInput,
329
- ),
330
- ):
331
- self.dispatching(recv_req)
332
- elif isinstance(recv_req, BlockReqInput):
333
- for worker in self.workers:
334
- worker.send_pyobj(recv_req)
335
- else:
336
- # Send other control messages to first worker of tp group
337
- for worker in self.workers[:: self.control_message_step]:
338
- worker.send_pyobj(recv_req)
377
+ self._request_dispatcher(recv_req)
339
378
 
340
379
 
341
380
  def run_data_parallel_controller_process(
@@ -343,15 +382,14 @@ def run_data_parallel_controller_process(
343
382
  port_args: PortArgs,
344
383
  pipe_writer,
345
384
  ):
385
+ kill_itself_when_parent_died()
346
386
  setproctitle.setproctitle("sglang::data_parallel_controller")
387
+ faulthandler.enable()
347
388
  configure_logger(server_args)
348
389
  parent_process = psutil.Process().parent()
349
- balance_meta = DPBalanceMeta(server_args.dp_size)
350
390
 
351
391
  try:
352
- controller = DataParallelController(
353
- server_args, port_args, dp_balance_meta=balance_meta
354
- )
392
+ controller = DataParallelController(server_args, port_args)
355
393
  pipe_writer.send(
356
394
  {
357
395
  "status": "ready",
@@ -370,6 +408,3 @@ def run_data_parallel_controller_process(
370
408
  traceback = get_exception_traceback()
371
409
  logger.error(f"DataParallelController hit an exception: {traceback}")
372
410
  parent_process.send_signal(signal.SIGQUIT)
373
- finally:
374
- # we need to destruct mp.Manager() in balance_meta
375
- balance_meta.destructor()
@@ -24,17 +24,16 @@ import psutil
24
24
  import setproctitle
25
25
  import zmq
26
26
 
27
- from sglang.srt.hf_transformers_utils import get_tokenizer
28
27
  from sglang.srt.managers.io_struct import (
29
- BatchEmbeddingOut,
28
+ BatchEmbeddingOutput,
30
29
  BatchMultimodalDecodeReq,
31
- BatchMultimodalOut,
32
- BatchStrOut,
33
- BatchTokenIDOut,
30
+ BatchMultimodalOutput,
31
+ BatchStrOutput,
32
+ BatchTokenIDOutput,
34
33
  FreezeGCReq,
35
34
  MultiTokenizerRegisterReq,
36
35
  )
37
- from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerMixin
36
+ from sglang.srt.managers.multi_tokenizer_mixin import MultiHttpWorkerDetokenizerMixin
38
37
  from sglang.srt.server_args import PortArgs, ServerArgs
39
38
  from sglang.srt.utils import (
40
39
  configure_logger,
@@ -42,6 +41,7 @@ from sglang.srt.utils import (
42
41
  get_zmq_socket,
43
42
  kill_itself_when_parent_died,
44
43
  )
44
+ from sglang.srt.utils.hf_transformers_utils import get_tokenizer
45
45
  from sglang.utils import (
46
46
  TypeBasedDispatcher,
47
47
  find_printable_text,
@@ -69,7 +69,7 @@ class DecodeStatus:
69
69
  sent_offset: int = 0
70
70
 
71
71
 
72
- class DetokenizerManager(MultiTokenizerMixin):
72
+ class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
73
73
  """DetokenizerManager is a process that detokenizes the token ids."""
74
74
 
75
75
  def __init__(
@@ -101,8 +101,8 @@ class DetokenizerManager(MultiTokenizerMixin):
101
101
 
102
102
  self._request_dispatcher = TypeBasedDispatcher(
103
103
  [
104
- (BatchEmbeddingOut, self.handle_batch_embedding_out),
105
- (BatchTokenIDOut, self.handle_batch_token_id_out),
104
+ (BatchEmbeddingOutput, self.handle_batch_embedding_out),
105
+ (BatchTokenIDOutput, self.handle_batch_token_id_out),
106
106
  (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
107
107
  (MultiTokenizerRegisterReq, lambda x: x),
108
108
  (FreezeGCReq, self.handle_freeze_gc_req),
@@ -145,11 +145,11 @@ class DetokenizerManager(MultiTokenizerMixin):
145
145
  return output[:-1]
146
146
  return output
147
147
 
148
def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOutput):
    """Return ``recv_obj`` unchanged.

    Embedding-model output needs no detokenization, so the received
    object is passed straight through.
    """
    return recv_obj
151
151
 
152
- def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOut):
152
+ def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOutput):
153
153
  bs = len(recv_obj.rids)
154
154
 
155
155
  # Initialize decode status
@@ -224,7 +224,7 @@ class DetokenizerManager(MultiTokenizerMixin):
224
224
  s.sent_offset = len(output_str)
225
225
  output_strs.append(incremental_output)
226
226
 
227
- return BatchStrOut(
227
+ return BatchStrOutput(
228
228
  rids=recv_obj.rids,
229
229
  finished_reasons=recv_obj.finished_reasons,
230
230
  output_strs=output_strs,
@@ -246,17 +246,21 @@ class DetokenizerManager(MultiTokenizerMixin):
246
246
  output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
247
247
  output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
248
248
  output_hidden_states=recv_obj.output_hidden_states,
249
+ placeholder_tokens_idx=None,
250
+ placeholder_tokens_val=None,
249
251
  )
250
252
 
251
253
def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
    """Detokenize a multimodal decode request and wrap the result.

    ``self.tokenizer.detokenize`` is called on the whole request; the
    remaining fields are copied from ``recv_obj`` and both
    ``placeholder_tokens_*`` fields are set to ``None``.
    """
    decoded = self.tokenizer.detokenize(recv_obj)
    passthrough = {
        "rids": recv_obj.rids,
        "finished_reasons": recv_obj.finished_reasons,
        "outputs": decoded,
        "prompt_tokens": recv_obj.prompt_tokens,
        "completion_tokens": recv_obj.completion_tokens,
        "cached_tokens": recv_obj.cached_tokens,
    }
    return BatchMultimodalOutput(
        **passthrough,
        placeholder_tokens_idx=None,
        placeholder_tokens_val=None,
    )
261
265
 
262
266
  def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
@@ -289,11 +293,11 @@ def run_detokenizer_process(
289
293
  try:
290
294
  manager = DetokenizerManager(server_args, port_args)
291
295
  if server_args.tokenizer_worker_num > 1:
292
- manager.multi_tokenizer_manager_event_loop()
296
+ manager.multi_http_worker_event_loop()
293
297
  else:
294
298
  manager.event_loop()
295
299
  except Exception:
296
- manager.clear_tokenizer_mapping()
300
+ manager.maybe_clear_socket_mapping()
297
301
  traceback = get_exception_traceback()
298
302
  logger.error(f"DetokenizerManager hit an exception: {traceback}")
299
303
  parent_process.send_signal(signal.SIGQUIT)
@@ -0,0 +1,46 @@
1
+ """Start bootstrap/kv-store-related server"""
2
+
3
+ import os
4
+ from typing import Type
5
+
6
+ from sglang.srt.disaggregation.base import BaseKVBootstrapServer
7
+ from sglang.srt.disaggregation.utils import (
8
+ DisaggregationMode,
9
+ KVClassType,
10
+ TransferBackend,
11
+ get_kv_class,
12
+ )
13
+ from sglang.srt.server_args import ServerArgs
14
+
15
+
16
def start_disagg_service(
    server_args: ServerArgs,
):
    """Start the KV bootstrap server when running in prefill mode.

    Args:
        server_args: Supplies ``disaggregation_mode``,
            ``disaggregation_transfer_backend``, ``host``,
            ``disaggregation_bootstrap_port`` and ``node_rank``.

    Returns:
        The bootstrap server instance in prefill mode, otherwise ``None``.

    Raises:
        RuntimeError: If creating the Ascend config store fails.
    """
    disagg_mode = DisaggregationMode(server_args.disaggregation_mode)
    transfer_backend = TransferBackend(server_args.disaggregation_transfer_backend)

    # Only the prefill side starts a bootstrap server.
    # NOTE(review): this assumes the store-creation branch below is nested in
    # the prefill branch -- confirm against the packaged module.
    if disagg_mode != DisaggregationMode.PREFILL:
        return None

    kv_bootstrap_server_class: Type[BaseKVBootstrapServer] = get_kv_class(
        transfer_backend, KVClassType.BOOTSTRAP_SERVER
    )
    bootstrap_server: BaseKVBootstrapServer = kv_bootstrap_server_class(
        host=server_args.host,
        port=server_args.disaggregation_bootstrap_port,
    )

    is_create_store = (
        server_args.node_rank == 0 and transfer_backend == TransferBackend.ASCEND
    )
    if is_create_store:
        try:
            from mf_adapter import create_config_store

            ascend_url = os.getenv("ASCEND_MF_STORE_URL")
            create_config_store(ascend_url)
        except Exception as e:
            # Raising a plain str is itself a TypeError and would hide the
            # real failure; raise a proper exception and keep the cause.
            raise RuntimeError(
                f"Failed create mf store, invalid ascend_url. With exception {e}"
            ) from e

    return bootstrap_server