sglang-0.5.2rc2-py3-none-any.whl → sglang-0.5.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/sglang/srt/managers/tokenizer_communicator_mixin.py
@@ -0,0 +1,675 @@
+from __future__ import annotations
+
+import asyncio
+import copy
+import logging
+import os
+import time
+import uuid
+from collections import deque
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Deque,
+    Dict,
+    Generic,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+)
+
+import fastapi
+import zmq
+
+from sglang.srt.managers.io_struct import (
+    ClearHiCacheReqInput,
+    ClearHiCacheReqOutput,
+    CloseSessionReqInput,
+    DestroyWeightsUpdateGroupReqInput,
+    DestroyWeightsUpdateGroupReqOutput,
+    ExpertDistributionReq,
+    ExpertDistributionReqOutput,
+    ExpertDistributionReqType,
+    FlushCacheReqInput,
+    FlushCacheReqOutput,
+    GetInternalStateReq,
+    GetInternalStateReqOutput,
+    GetLoadReqInput,
+    GetLoadReqOutput,
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
+    LoadLoRAAdapterReqInput,
+    LoadLoRAAdapterReqOutput,
+    LoRAUpdateOutput,
+    MultiTokenizerWrapper,
+    OpenSessionReqInput,
+    ProfileReq,
+    ProfileReqOutput,
+    ProfileReqType,
+    ReleaseMemoryOccupationReqInput,
+    ReleaseMemoryOccupationReqOutput,
+    ResumeMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqOutput,
+    SendWeightsToRemoteInstanceReqInput,
+    SendWeightsToRemoteInstanceReqOutput,
+    SetInternalStateReq,
+    SetInternalStateReqOutput,
+    SlowDownReqInput,
+    SlowDownReqOutput,
+    UnloadLoRAAdapterReqInput,
+    UnloadLoRAAdapterReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightsFromTensorReqOutput,
+)
+from sglang.srt.server_args import LoRARef, ServerArgs
+from sglang.srt.utils import get_bool_env_var
+from sglang.utils import TypeBasedDispatcher
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+T = TypeVar("T")
+
+logger = logging.getLogger(__name__)
+
+
+class _Communicator(Generic[T]):
+    """Note: The communicator now only runs up to 1 in-flight request at any time."""
+
+    enable_multi_tokenizer = False
+
+    def __init__(self, sender: zmq.Socket, fan_out: int, mode="queueing"):
+        self._sender = sender
+        self._fan_out = fan_out
+        self._mode = mode
+        self._result_event: Optional[asyncio.Event] = None
+        self._result_values: Optional[List[T]] = None
+        self._ready_queue: Deque[asyncio.Future] = deque()
+
+        assert mode in ["queueing", "watching"]
+
+    async def queueing_call(self, obj: T):
+        ready_event = asyncio.Event()
+        if self._result_event is not None or len(self._ready_queue) > 0:
+            self._ready_queue.append(ready_event)
+            await ready_event.wait()
+            assert self._result_event is None
+            assert self._result_values is None
+
+        if obj:
+            if _Communicator.enable_multi_tokenizer:
+                obj = MultiTokenizerWrapper(worker_id=os.getpid(), obj=obj)
+            self._sender.send_pyobj(obj)
+
+        self._result_event = asyncio.Event()
+        self._result_values = []
+        await self._result_event.wait()
+        result_values = self._result_values
+        self._result_event = self._result_values = None
+
+        if len(self._ready_queue) > 0:
+            self._ready_queue.popleft().set()
+
+        return result_values
+
+    async def watching_call(self, obj):
+        if self._result_event is None:
+            assert self._result_values is None
+            self._result_values = []
+            self._result_event = asyncio.Event()
+
+        if obj:
+            if _Communicator.enable_multi_tokenizer:
+                obj = MultiTokenizerWrapper(worker_id=os.getpid(), obj=obj)
+            self._sender.send_pyobj(obj)
+
+        await self._result_event.wait()
+        result_values = copy.deepcopy(self._result_values)
+        self._result_event = self._result_values = None
+        return result_values
+
+    async def __call__(self, obj):
+        if self._mode == "queueing":
+            return await self.queueing_call(obj)
+        else:
+            return await self.watching_call(obj)
+
+    def handle_recv(self, recv_obj: T):
+        self._result_values.append(recv_obj)
+        if len(self._result_values) == self._fan_out:
+            self._result_event.set()
+
+
+class TokenizerCommunicatorMixin:
+    """Mixin class for TokenizerManager to handle communication with the scheduler."""
+
+    def init_communicators(self: TokenizerManager, server_args: ServerArgs):
+        # Communicators
+        self.init_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.destroy_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_distributed_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.init_weights_send_group_for_remote_instance_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.send_weights_to_remote_instance_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_tensor_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_weights_by_name_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.release_memory_occupation_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.resume_memory_occupation_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.slow_down_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.flush_cache_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.clear_hicache_storage_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.profile_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_internal_state_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.set_internal_state_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.expert_distribution_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_lora_adapter_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_load_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size, mode="watching"
+        )
+
+        self._result_dispatcher += self._get_communicator_dispatcher()
+
+    def _get_communicator_dispatcher(self: TokenizerManager):
+        return TypeBasedDispatcher(
+            [
+                (
+                    InitWeightsUpdateGroupReqOutput,
+                    self.init_weights_update_group_communicator.handle_recv,
+                ),
+                (
+                    DestroyWeightsUpdateGroupReqOutput,
+                    self.destroy_weights_update_group_communicator.handle_recv,
+                ),
+                (
+                    UpdateWeightsFromDistributedReqOutput,
+                    self.update_weights_from_distributed_communicator.handle_recv,
+                ),
+                (
+                    InitWeightsSendGroupForRemoteInstanceReqOutput,
+                    self.init_weights_send_group_for_remote_instance_communicator.handle_recv,
+                ),
+                (
+                    SendWeightsToRemoteInstanceReqOutput,
+                    self.send_weights_to_remote_instance_communicator.handle_recv,
+                ),
+                (
+                    UpdateWeightsFromTensorReqOutput,
+                    self.update_weights_from_tensor_communicator.handle_recv,
+                ),
+                (
+                    GetWeightsByNameReqOutput,
+                    self.get_weights_by_name_communicator.handle_recv,
+                ),
+                (
+                    ReleaseMemoryOccupationReqOutput,
+                    self.release_memory_occupation_communicator.handle_recv,
+                ),
+                (
+                    ResumeMemoryOccupationReqOutput,
+                    self.resume_memory_occupation_communicator.handle_recv,
+                ),
+                (
+                    SlowDownReqOutput,
+                    self.slow_down_communicator.handle_recv,
+                ),
+                (
+                    ClearHiCacheReqOutput,
+                    self.clear_hicache_storage_communicator.handle_recv,
+                ),
+                (
+                    FlushCacheReqOutput,
+                    self.flush_cache_communicator.handle_recv,
+                ),
+                (
+                    ProfileReqOutput,
+                    self.profile_communicator.handle_recv,
+                ),
+                (
+                    GetInternalStateReqOutput,
+                    self.get_internal_state_communicator.handle_recv,
+                ),
+                (
+                    SetInternalStateReqOutput,
+                    self.set_internal_state_communicator.handle_recv,
+                ),
+                (
+                    ExpertDistributionReqOutput,
+                    self.expert_distribution_communicator.handle_recv,
+                ),
+                (
+                    LoRAUpdateOutput,
+                    self.update_lora_adapter_communicator.handle_recv,
+                ),
+                (
+                    GetLoadReqOutput,
+                    self.get_load_communicator.handle_recv,
+                ),
+            ]
+        )
+
+    async def flush_cache(self: TokenizerManager) -> FlushCacheReqOutput:
+        return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]
+
+    async def clear_hicache_storage(self: TokenizerManager) -> ClearHiCacheReqOutput:
+        """Clear the hierarchical cache storage."""
+        # Delegate to the scheduler to handle HiCacheStorage clearing
+        return (await self.clear_hicache_storage_communicator(ClearHiCacheReqInput()))[
+            0
+        ]
+
+    async def start_profile(
+        self: TokenizerManager,
+        output_dir: Optional[str] = None,
+        start_step: Optional[int] = None,
+        num_steps: Optional[int] = None,
+        activities: Optional[List[str]] = None,
+        with_stack: Optional[bool] = None,
+        record_shapes: Optional[bool] = None,
+        profile_by_stage: bool = False,
+    ):
+        self.auto_create_handle_loop()
+        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+        with_stack = False if with_stack is False or env_with_stack is False else True
+        req = ProfileReq(
+            type=ProfileReqType.START_PROFILE,
+            output_dir=output_dir,
+            start_step=start_step,
+            num_steps=num_steps,
+            activities=activities,
+            with_stack=with_stack,
+            record_shapes=record_shapes,
+            profile_by_stage=profile_by_stage,
+            profile_id=str(time.time()),
+        )
+        return await self._execute_profile(req)
+
+    async def stop_profile(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
+        return await self._execute_profile(req)
+
+    async def _execute_profile(self: TokenizerManager, req: ProfileReq):
+        result = (await self.profile_communicator(req))[0]
+        if not result.success:
+            raise RuntimeError(result.message)
+        return result
+
+    async def start_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ExpertDistributionReq(action=ExpertDistributionReqType.START_RECORD)
+        await self.expert_distribution_communicator(req)
+
+    async def stop_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ExpertDistributionReq(action=ExpertDistributionReqType.STOP_RECORD)
+        await self.expert_distribution_communicator(req)
+
+    async def dump_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ExpertDistributionReq(action=ExpertDistributionReqType.DUMP_RECORD)
+        await self.expert_distribution_communicator(req)
+
+    async def init_weights_update_group(
+        self: TokenizerManager,
+        obj: InitWeightsUpdateGroupReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be 1 for init parameter update group"
+        result = (await self.init_weights_update_group_communicator(obj))[0]
+        return result.success, result.message
+
+    async def destroy_weights_update_group(
+        self,
+        obj: DestroyWeightsUpdateGroupReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be 1 for destroy parameter update group"
+        result = (await self.destroy_weights_update_group_communicator(obj))[0]
+        return result.success, result.message
+
+    async def update_weights_from_distributed(
+        self: TokenizerManager,
+        obj: UpdateWeightsFromDistributedReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        # This means that weight sync
+        # cannot run while requests are in progress.
+        async with self.model_update_lock.writer_lock:
+            result = (await self.update_weights_from_distributed_communicator(obj))[0]
+            return result.success, result.message
+
+    async def init_weights_send_group_for_remote_instance(
+        self,
+        obj: InitWeightsSendGroupForRemoteInstanceReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        # TODO: support DP
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be 1 for init_weights_send_group_for_remote_instance"
+        result = (
+            await self.init_weights_send_group_for_remote_instance_communicator(obj)
+        )[0]
+        return result.success, result.message
+
+    async def send_weights_to_remote_instance(
+        self,
+        obj: SendWeightsToRemoteInstanceReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        # TODO: support DP
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be 1 for send_weights_to_remote_instance"
+        result = (await self.send_weights_to_remote_instance_communicator(obj))[0]
+        return result.success, result.message
+
+    async def update_weights_from_tensor(
+        self: TokenizerManager,
+        obj: UpdateWeightsFromTensorReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from tensor"
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        # This means that weight sync
+        # cannot run while requests are in progress.
+        async with self.model_update_lock.writer_lock:
+            result = (await self.update_weights_from_tensor_communicator(obj))[0]
+            return result.success, result.message
+
+    async def load_lora_adapter(
+        self: TokenizerManager,
+        obj: LoadLoRAAdapterReqInput,
+        _: Optional[fastapi.Request] = None,
+    ) -> LoadLoRAAdapterReqOutput:
+        self.auto_create_handle_loop()
+
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
+
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start load Lora adapter. Lora name=%s, path=%s",
+                obj.lora_name,
+                obj.lora_path,
+            )
+
+            async with self.lora_update_lock:
+                if (
+                    self.server_args.max_loaded_loras is not None
+                    and self.lora_registry.num_registered_loras
+                    >= self.server_args.max_loaded_loras
+                ):
+                    raise ValueError(
+                        f"Cannot load LoRA adapter {obj.lora_name} at path {obj.lora_path}. "
+                        f"Maximum number of loaded LoRA adapters is {self.server_args.max_loaded_loras}. "
+                        "Please unload some LoRA adapters before loading new ones."
+                    )
+
+                # Generate new uniquely identifiable LoRARef object.
+                new_adapter = LoRARef(
+                    lora_name=obj.lora_name,
+                    lora_path=obj.lora_path,
+                    pinned=obj.pinned,
+                )
+
+                # Trigger the actual loading operation at the backend processes.
+                obj.lora_id = new_adapter.lora_id
+                result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                # Register the LoRA adapter only after loading is successful.
+                if result.success:
+                    await self.lora_registry.register(new_adapter)
+
+                return result
+        except ValueError as e:
+            return LoadLoRAAdapterReqOutput(
+                success=False,
+                error_message=str(e),
+            )
+
+    async def unload_lora_adapter(
+        self: TokenizerManager,
+        obj: UnloadLoRAAdapterReqInput,
+        _: Optional[fastapi.Request] = None,
+    ) -> UnloadLoRAAdapterReqOutput:
+        self.auto_create_handle_loop()
+
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
+
+            assert (
+                obj.lora_name is not None
+            ), "lora_name must be provided to unload LoRA adapter"
+
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start unload Lora adapter. Lora name=%s",
+                obj.lora_name,
+            )
+
+            async with self.lora_update_lock:
+                # Unregister the LoRA adapter from the registry to stop new requests for this adapter
+                # from being started.
+                lora_id = await self.lora_registry.unregister(obj.lora_name)
+                obj.lora_id = lora_id
+
+                # Initiate the actual unloading operation at the backend processes only after all
+                # ongoing requests using this LoRA adapter are finished.
+                await self.lora_registry.wait_for_unload(lora_id)
+                result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                return result
+        except ValueError as e:
+            return UnloadLoRAAdapterReqOutput(success=False, error_message=str(e))
+
+    async def get_weights_by_name(
+        self: TokenizerManager,
+        obj: GetWeightsByNameReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        results = await self.get_weights_by_name_communicator(obj)
+        all_parameters = [r.parameter for r in results]
+        if self.server_args.dp_size == 1:
+            return all_parameters[0]
+        else:
+            return all_parameters
+
+    async def release_memory_occupation(
+        self: TokenizerManager,
+        obj: ReleaseMemoryOccupationReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.release_memory_occupation_communicator(obj)
+
+    async def resume_memory_occupation(
+        self: TokenizerManager,
+        obj: ResumeMemoryOccupationReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.resume_memory_occupation_communicator(obj)
+
+    async def slow_down(
+        self: TokenizerManager,
+        obj: SlowDownReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.slow_down_communicator(obj)
+
+    async def get_internal_state(self: TokenizerManager) -> List[Dict[Any, Any]]:
+        req = GetInternalStateReq()
+        responses: List[GetInternalStateReqOutput] = (
+            await self.get_internal_state_communicator(req)
+        )
+        # Many DP ranks
+        return [res.internal_state for res in responses]
+
+    async def set_internal_state(
+        self: TokenizerManager, obj: SetInternalStateReq
+    ) -> List[bool]:
+        responses: List[SetInternalStateReqOutput] = (
+            await self.set_internal_state_communicator(obj)
+        )
+        return [res.updated for res in responses]
+
+    async def get_load(self: TokenizerManager) -> List[GetLoadReqOutput]:
+        req = GetLoadReqInput()
+        return await self.get_load_communicator(req)
+
+    async def open_session(
+        self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
+    ):
+        self.auto_create_handle_loop()
+
+        if obj.session_id is None:
+            obj.session_id = uuid.uuid4().hex
+        elif obj.session_id in self.session_futures:
+            return None
+
+        if self.server_args.tokenizer_worker_num > 1:
+            obj = MultiTokenizerWrapper(self.worker_id, obj)
+        self.send_to_scheduler.send_pyobj(obj)
+
+        self.session_futures[obj.session_id] = asyncio.Future()
+        session_id = await self.session_futures[obj.session_id]
+        del self.session_futures[obj.session_id]
+        return session_id
+
+    async def close_session(
+        self, obj: CloseSessionReqInput, request: Optional[fastapi.Request] = None
+    ):
+        await self.send_to_scheduler.send_pyobj(obj)
+
+    def get_log_request_metadata(self):
+        max_length = None
+        skip_names = None
+        out_skip_names = None
+        if self.log_requests:
+            if self.log_requests_level == 0:
+                max_length = 1 << 30
+                skip_names = set(
+                    [
+                        "text",
+                        "input_ids",
+                        "input_embeds",
+                        "image_data",
+                        "audio_data",
+                        "lora_path",
+                        "sampling_params",
+                    ]
+                )
+                out_skip_names = set(
+                    [
+                        "text",
+                        "output_ids",
+                        "embedding",
+                    ]
+                )
+            elif self.log_requests_level == 1:
+                max_length = 1 << 30
+                skip_names = set(
+                    [
+                        "text",
+                        "input_ids",
+                        "input_embeds",
+                        "image_data",
+                        "audio_data",
+                        "lora_path",
+                    ]
+                )
+                out_skip_names = set(
+                    [
+                        "text",
+                        "output_ids",
+                        "embedding",
+                    ]
+                )
+            elif self.log_requests_level == 2:
+                max_length = 2048
+            elif self.log_requests_level == 3:
+                max_length = 1 << 30
+            else:
+                raise ValueError(
+                    f"Invalid --log-requests-level: {self.log_requests_level=}"
+                )
+        return max_length, skip_names, out_skip_names
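
The hunk above is the new file sglang/srt/managers/tokenizer_communicator_mixin.py (entry 231 in the listing). What follows is a minimal, self-contained sketch, not part of the diff, illustrating the fan-out pattern that _Communicator's "queueing" mode implements: one in-flight request at a time, resolved only after one reply per dp rank has arrived. FakeSender, QueueingCommunicator, and the two rank lambdas are hypothetical stand-ins for the zmq socket, the real _Communicator, and the scheduler processes.

import asyncio
from collections import deque
from typing import Any, Callable, Deque, List, Optional


class FakeSender:
    """Stands in for the zmq socket: synchronously hands each request to
    every simulated scheduler rank."""

    def __init__(self, handlers: List[Callable[[Any], None]]):
        self._handlers = handlers

    def send_pyobj(self, obj: Any) -> None:
        for handler in self._handlers:
            handler(obj)


class QueueingCommunicator:
    """Same shape as _Communicator's "queueing" mode: one in-flight request,
    resolved only after fan_out replies reach handle_recv."""

    def __init__(self, sender: FakeSender, fan_out: int):
        self._sender = sender
        self._fan_out = fan_out
        self._result_event: Optional[asyncio.Event] = None
        self._result_values: Optional[List[Any]] = None
        self._ready_queue: Deque[asyncio.Event] = deque()

    async def __call__(self, obj: Any) -> List[Any]:
        ready_event = asyncio.Event()
        if self._result_event is not None or self._ready_queue:
            # Another request is in flight: queue up and wait for our turn.
            self._ready_queue.append(ready_event)
            await ready_event.wait()

        # Because the fake replies synchronously, the result event and value
        # list must exist before sending; with a real zmq socket the replies
        # arrive later through the event loop.
        self._result_event = asyncio.Event()
        self._result_values = []
        self._sender.send_pyobj(obj)
        await self._result_event.wait()  # released by the fan_out-th reply
        results = self._result_values
        self._result_event = self._result_values = None
        if self._ready_queue:
            self._ready_queue.popleft().set()  # wake the next queued caller
        return results

    def handle_recv(self, recv_obj: Any) -> None:
        # In the mixin above, this is the method the TypeBasedDispatcher calls
        # when a reply object of the matching type comes back from a rank.
        self._result_values.append(recv_obj)
        if len(self._result_values) == self._fan_out:
            self._result_event.set()


async def main() -> None:
    comm = QueueingCommunicator(sender=FakeSender([]), fan_out=2)
    # Two fake dp ranks, each echoing the request back as a reply.
    comm._sender = FakeSender(
        [
            lambda o: comm.handle_recv(f"rank0 ack: {o}"),
            lambda o: comm.handle_recv(f"rank1 ack: {o}"),
        ]
    )
    print(await comm("FlushCacheReqInput"))
    # -> ['rank0 ack: FlushCacheReqInput', 'rank1 ack: FlushCacheReqInput']


asyncio.run(main())

This mirrors why the mixin's control-plane methods (flush_cache, profile, weight updates) index the returned list with [0] under dp_size == 1, and why the queueing mode serializes operations such as weight sync, while the "watching" mode used by get_load instead lets concurrent callers share one pending result.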