sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (238)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/device_config.py +3 -1
  5. sglang/srt/configs/dots_vlm.py +139 -0
  6. sglang/srt/configs/load_config.py +1 -0
  7. sglang/srt/configs/model_config.py +50 -6
  8. sglang/srt/configs/qwen3_next.py +326 -0
  9. sglang/srt/connector/__init__.py +8 -1
  10. sglang/srt/connector/remote_instance.py +82 -0
  11. sglang/srt/constrained/base_grammar_backend.py +48 -12
  12. sglang/srt/constrained/llguidance_backend.py +0 -1
  13. sglang/srt/constrained/outlines_backend.py +0 -1
  14. sglang/srt/constrained/xgrammar_backend.py +28 -9
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/base/conn.py +1 -1
  21. sglang/srt/disaggregation/common/conn.py +15 -12
  22. sglang/srt/disaggregation/decode.py +21 -10
  23. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -445
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +5 -3
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +24 -3
  31. sglang/srt/entrypoints/engine.py +38 -17
  32. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  33. sglang/srt/entrypoints/grpc_server.py +680 -0
  34. sglang/srt/entrypoints/http_server.py +85 -54
  35. sglang/srt/entrypoints/openai/protocol.py +4 -1
  36. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  37. sglang/srt/entrypoints/openai/serving_chat.py +36 -16
  38. sglang/srt/entrypoints/openai/serving_completions.py +12 -3
  39. sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
  40. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  41. sglang/srt/entrypoints/openai/serving_responses.py +6 -3
  42. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  43. sglang/srt/eplb/eplb_manager.py +2 -2
  44. sglang/srt/eplb/expert_distribution.py +26 -13
  45. sglang/srt/eplb/expert_location.py +8 -3
  46. sglang/srt/eplb/expert_location_updater.py +1 -1
  47. sglang/srt/function_call/base_format_detector.py +3 -6
  48. sglang/srt/function_call/ebnf_composer.py +11 -9
  49. sglang/srt/function_call/function_call_parser.py +6 -0
  50. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  51. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  52. sglang/srt/grpc/__init__.py +1 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  56. sglang/srt/hf_transformers_utils.py +4 -0
  57. sglang/srt/layers/activation.py +142 -9
  58. sglang/srt/layers/attention/ascend_backend.py +11 -4
  59. sglang/srt/layers/attention/fla/chunk.py +242 -0
  60. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  61. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  62. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  63. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  66. sglang/srt/layers/attention/fla/index.py +37 -0
  67. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  68. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  69. sglang/srt/layers/attention/fla/op.py +66 -0
  70. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  71. sglang/srt/layers/attention/fla/utils.py +331 -0
  72. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  73. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  74. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  75. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  76. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  77. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  78. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  79. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  80. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  81. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  82. sglang/srt/layers/attention/triton_backend.py +18 -1
  83. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  84. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  85. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  86. sglang/srt/layers/dp_attention.py +30 -1
  87. sglang/srt/layers/layernorm.py +32 -15
  88. sglang/srt/layers/linear.py +34 -3
  89. sglang/srt/layers/logits_processor.py +29 -10
  90. sglang/srt/layers/moe/__init__.py +2 -1
  91. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  92. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  93. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  94. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  95. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  96. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  107. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  108. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  109. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  110. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  111. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  112. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  113. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  114. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  115. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  116. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  117. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  118. sglang/srt/layers/moe/topk.py +30 -9
  119. sglang/srt/layers/moe/utils.py +12 -6
  120. sglang/srt/layers/quantization/awq.py +19 -7
  121. sglang/srt/layers/quantization/base_config.py +11 -6
  122. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  123. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  124. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  125. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  126. sglang/srt/layers/quantization/fp8.py +76 -47
  127. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  128. sglang/srt/layers/quantization/gptq.py +25 -17
  129. sglang/srt/layers/quantization/modelopt_quant.py +147 -47
  130. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  131. sglang/srt/layers/quantization/mxfp4.py +64 -40
  132. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  133. sglang/srt/layers/quantization/unquant.py +135 -47
  134. sglang/srt/layers/quantization/w4afp8.py +30 -17
  135. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  136. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  137. sglang/srt/layers/sampler.py +162 -18
  138. sglang/srt/lora/backend/base_backend.py +50 -8
  139. sglang/srt/lora/backend/triton_backend.py +90 -2
  140. sglang/srt/lora/layers.py +32 -0
  141. sglang/srt/lora/lora.py +4 -1
  142. sglang/srt/lora/lora_manager.py +35 -112
  143. sglang/srt/lora/mem_pool.py +24 -10
  144. sglang/srt/lora/utils.py +18 -9
  145. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  146. sglang/srt/managers/cache_controller.py +158 -160
  147. sglang/srt/managers/data_parallel_controller.py +105 -35
  148. sglang/srt/managers/detokenizer_manager.py +8 -4
  149. sglang/srt/managers/disagg_service.py +46 -0
  150. sglang/srt/managers/io_struct.py +199 -12
  151. sglang/srt/managers/mm_utils.py +1 -0
  152. sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
  153. sglang/srt/managers/schedule_batch.py +77 -56
  154. sglang/srt/managers/schedule_policy.py +1 -1
  155. sglang/srt/managers/scheduler.py +187 -39
  156. sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
  157. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  158. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  159. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  160. sglang/srt/managers/tokenizer_manager.py +259 -519
  161. sglang/srt/managers/tp_worker.py +53 -4
  162. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  163. sglang/srt/mem_cache/hicache_storage.py +3 -23
  164. sglang/srt/mem_cache/hiradix_cache.py +103 -43
  165. sglang/srt/mem_cache/memory_pool.py +347 -48
  166. sglang/srt/mem_cache/memory_pool_host.py +105 -46
  167. sglang/srt/mem_cache/radix_cache.py +0 -2
  168. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  169. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  170. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
  171. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  172. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  173. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
  174. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  175. sglang/srt/metrics/collector.py +493 -76
  176. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  177. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  178. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  179. sglang/srt/model_executor/forward_batch_info.py +59 -2
  180. sglang/srt/model_executor/model_runner.py +356 -29
  181. sglang/srt/model_loader/__init__.py +9 -3
  182. sglang/srt/model_loader/loader.py +128 -4
  183. sglang/srt/model_loader/weight_utils.py +2 -1
  184. sglang/srt/models/apertus.py +686 -0
  185. sglang/srt/models/bailing_moe.py +798 -218
  186. sglang/srt/models/bailing_moe_nextn.py +168 -0
  187. sglang/srt/models/deepseek_v2.py +109 -15
  188. sglang/srt/models/dots_vlm.py +174 -0
  189. sglang/srt/models/dots_vlm_vit.py +337 -0
  190. sglang/srt/models/ernie4.py +1 -1
  191. sglang/srt/models/gemma3n_mm.py +1 -1
  192. sglang/srt/models/glm4_moe.py +1 -1
  193. sglang/srt/models/glm4v.py +4 -2
  194. sglang/srt/models/glm4v_moe.py +3 -0
  195. sglang/srt/models/gpt_oss.py +1 -1
  196. sglang/srt/models/llama4.py +9 -0
  197. sglang/srt/models/llama_eagle3.py +13 -0
  198. sglang/srt/models/longcat_flash.py +2 -2
  199. sglang/srt/models/mllama4.py +25 -0
  200. sglang/srt/models/opt.py +637 -0
  201. sglang/srt/models/qwen2.py +7 -0
  202. sglang/srt/models/qwen2_5_vl.py +27 -3
  203. sglang/srt/models/qwen2_moe.py +56 -12
  204. sglang/srt/models/qwen3_moe.py +1 -1
  205. sglang/srt/models/qwen3_next.py +1042 -0
  206. sglang/srt/models/qwen3_next_mtp.py +112 -0
  207. sglang/srt/models/step3_vl.py +1 -1
  208. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  209. sglang/srt/multimodal/processors/glm4v.py +9 -9
  210. sglang/srt/multimodal/processors/internvl.py +141 -129
  211. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  212. sglang/srt/offloader.py +27 -3
  213. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  214. sglang/srt/sampling/sampling_batch_info.py +18 -15
  215. sglang/srt/server_args.py +276 -35
  216. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  217. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  218. sglang/srt/speculative/eagle_utils.py +0 -2
  219. sglang/srt/speculative/eagle_worker.py +43 -4
  220. sglang/srt/speculative/spec_info.py +5 -0
  221. sglang/srt/speculative/standalone_worker.py +109 -0
  222. sglang/srt/tracing/trace.py +552 -0
  223. sglang/srt/utils.py +34 -3
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  226. sglang/test/runners.py +4 -0
  227. sglang/test/test_cutlass_moe.py +24 -6
  228. sglang/test/test_disaggregation_utils.py +66 -0
  229. sglang/test/test_fp4_moe.py +370 -1
  230. sglang/test/test_utils.py +28 -1
  231. sglang/utils.py +11 -0
  232. sglang/version.py +1 -1
  233. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  234. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
  235. sglang/srt/disaggregation/launch_lb.py +0 -118
  236. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  237. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  238. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_communicator_mixin.py (new file)
@@ -0,0 +1,569 @@
+ from __future__ import annotations
+
+ import asyncio
+ import copy
+ import logging
+ import os
+ import time
+ from collections import deque
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Deque,
+     Dict,
+     Generic,
+     List,
+     Optional,
+     Tuple,
+     TypeVar,
+ )
+
+ import fastapi
+ import zmq
+
+ from sglang.srt.managers.io_struct import (
+     ClearHiCacheReqInput,
+     ClearHiCacheReqOutput,
+     ExpertDistributionReq,
+     ExpertDistributionReqOutput,
+     FlushCacheReqInput,
+     FlushCacheReqOutput,
+     GetInternalStateReq,
+     GetInternalStateReqOutput,
+     GetLoadReqInput,
+     GetLoadReqOutput,
+     GetWeightsByNameReqInput,
+     GetWeightsByNameReqOutput,
+     InitWeightsSendGroupForRemoteInstanceReqInput,
+     InitWeightsSendGroupForRemoteInstanceReqOutput,
+     InitWeightsUpdateGroupReqInput,
+     InitWeightsUpdateGroupReqOutput,
+     LoadLoRAAdapterReqInput,
+     LoadLoRAAdapterReqOutput,
+     LoRAUpdateResult,
+     MultiTokenizerWrapper,
+     ProfileReq,
+     ProfileReqOutput,
+     ProfileReqType,
+     ReleaseMemoryOccupationReqInput,
+     ReleaseMemoryOccupationReqOutput,
+     ResumeMemoryOccupationReqInput,
+     ResumeMemoryOccupationReqOutput,
+     SendWeightsToRemoteInstanceReqInput,
+     SendWeightsToRemoteInstanceReqOutput,
+     SetInternalStateReq,
+     SetInternalStateReqOutput,
+     SlowDownReqInput,
+     SlowDownReqOutput,
+     UnloadLoRAAdapterReqInput,
+     UnloadLoRAAdapterReqOutput,
+     UpdateWeightsFromDistributedReqInput,
+     UpdateWeightsFromDistributedReqOutput,
+     UpdateWeightsFromTensorReqInput,
+     UpdateWeightsFromTensorReqOutput,
+ )
+ from sglang.srt.server_args import LoRARef, ServerArgs
+ from sglang.srt.utils import get_bool_env_var
+ from sglang.utils import TypeBasedDispatcher
+
+ if TYPE_CHECKING:
+     from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+ T = TypeVar("T")
+
+ logger = logging.getLogger(__name__)
+
+
+ class _Communicator(Generic[T]):
+     """Note: The communicator now only run up to 1 in-flight request at any time."""
+
+     enable_multi_tokenizer = False
+
+     def __init__(self, sender: zmq.Socket, fan_out: int, mode="queueing"):
+         self._sender = sender
+         self._fan_out = fan_out
+         self._mode = mode
+         self._result_event: Optional[asyncio.Event] = None
+         self._result_values: Optional[List[T]] = None
+         self._ready_queue: Deque[asyncio.Future] = deque()
+
+         assert mode in ["queueing", "watching"]
+
+     async def queueing_call(self, obj: T):
+         ready_event = asyncio.Event()
+         if self._result_event is not None or len(self._ready_queue) > 0:
+             self._ready_queue.append(ready_event)
+             await ready_event.wait()
+             assert self._result_event is None
+             assert self._result_values is None
+
+         if obj:
+             if _Communicator.enable_multi_tokenizer:
+                 obj = MultiTokenizerWrapper(worker_id=os.getpid(), obj=obj)
+             self._sender.send_pyobj(obj)
+
+         self._result_event = asyncio.Event()
+         self._result_values = []
+         await self._result_event.wait()
+         result_values = self._result_values
+         self._result_event = self._result_values = None
+
+         if len(self._ready_queue) > 0:
+             self._ready_queue.popleft().set()
+
+         return result_values
+
+     async def watching_call(self, obj):
+         if self._result_event is None:
+             assert self._result_values is None
+             self._result_values = []
+             self._result_event = asyncio.Event()
+
+         if obj:
+             if _Communicator.enable_multi_tokenizer:
+                 obj = MultiTokenizerWrapper(worker_id=os.getpid(), obj=obj)
+             self._sender.send_pyobj(obj)
+
+         await self._result_event.wait()
+         result_values = copy.deepcopy(self._result_values)
+         self._result_event = self._result_values = None
+         return result_values
+
+     async def __call__(self, obj):
+         if self._mode == "queueing":
+             return await self.queueing_call(obj)
+         else:
+             return await self.watching_call(obj)
+
+     def handle_recv(self, recv_obj: T):
+         self._result_values.append(recv_obj)
+         if len(self._result_values) == self._fan_out:
+             self._result_event.set()
+
+
+ class TokenizerCommunicatorMixin:
+     """Mixin class for TokenizerManager to handle communication with the scheduler."""
+
+     def init_communicators(self: TokenizerManager, server_args: ServerArgs):
+         # Communicators
+         self.init_weights_update_group_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.update_weights_from_distributed_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.init_weights_send_group_for_remote_instance_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.send_weights_to_remote_instance_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.update_weights_from_tensor_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.get_weights_by_name_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.release_memory_occupation_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.resume_memory_occupation_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.slow_down_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.flush_cache_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.clear_hicache_storage_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.profile_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.get_internal_state_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.set_internal_state_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.expert_distribution_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.update_lora_adapter_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size
+         )
+         self.get_load_communicator = _Communicator(
+             self.send_to_scheduler, server_args.dp_size, mode="watching"
+         )
+
+         self._result_dispatcher += self._get_communicator_dispatcher()
+
+     def _get_communicator_dispatcher(self: TokenizerManager):
+         return TypeBasedDispatcher(
+             [
+                 (
+                     InitWeightsUpdateGroupReqOutput,
+                     self.init_weights_update_group_communicator.handle_recv,
+                 ),
+                 (
+                     UpdateWeightsFromDistributedReqOutput,
+                     self.update_weights_from_distributed_communicator.handle_recv,
+                 ),
+                 (
+                     InitWeightsSendGroupForRemoteInstanceReqOutput,
+                     self.init_weights_send_group_for_remote_instance_communicator.handle_recv,
+                 ),
+                 (
+                     SendWeightsToRemoteInstanceReqOutput,
+                     self.send_weights_to_remote_instance_communicator.handle_recv,
+                 ),
+                 (
+                     UpdateWeightsFromTensorReqOutput,
+                     self.update_weights_from_tensor_communicator.handle_recv,
+                 ),
+                 (
+                     GetWeightsByNameReqOutput,
+                     self.get_weights_by_name_communicator.handle_recv,
+                 ),
+                 (
+                     ReleaseMemoryOccupationReqOutput,
+                     self.release_memory_occupation_communicator.handle_recv,
+                 ),
+                 (
+                     ResumeMemoryOccupationReqOutput,
+                     self.resume_memory_occupation_communicator.handle_recv,
+                 ),
+                 (
+                     SlowDownReqOutput,
+                     self.slow_down_communicator.handle_recv,
+                 ),
+                 (
+                     ClearHiCacheReqOutput,
+                     self.clear_hicache_storage_communicator.handle_recv,
+                 ),
+                 (
+                     FlushCacheReqOutput,
+                     self.flush_cache_communicator.handle_recv,
+                 ),
+                 (
+                     ProfileReqOutput,
+                     self.profile_communicator.handle_recv,
+                 ),
+                 (
+                     GetInternalStateReqOutput,
+                     self.get_internal_state_communicator.handle_recv,
+                 ),
+                 (
+                     SetInternalStateReqOutput,
+                     self.set_internal_state_communicator.handle_recv,
+                 ),
+                 (
+                     ExpertDistributionReqOutput,
+                     self.expert_distribution_communicator.handle_recv,
+                 ),
+                 (
+                     LoRAUpdateResult,
+                     self.update_lora_adapter_communicator.handle_recv,
+                 ),
+                 (
+                     GetLoadReqOutput,
+                     self.get_load_communicator.handle_recv,
+                 ),
+             ]
+         )
+
+     async def flush_cache(self: TokenizerManager) -> FlushCacheReqOutput:
+         return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]
+
+     async def clear_hicache_storage(self: TokenizerManager) -> ClearHiCacheReqOutput:
+         """Clear the hierarchical cache storage."""
+         # Delegate to the scheduler to handle HiCacheStorage clearing
+         return (await self.clear_hicache_storage_communicator(ClearHiCacheReqInput()))[
+             0
+         ]
+
+     async def start_profile(
+         self: TokenizerManager,
+         output_dir: Optional[str] = None,
+         start_step: Optional[int] = None,
+         num_steps: Optional[int] = None,
+         activities: Optional[List[str]] = None,
+         with_stack: Optional[bool] = None,
+         record_shapes: Optional[bool] = None,
+         profile_by_stage: bool = False,
+     ):
+         self.auto_create_handle_loop()
+         env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+         with_stack = False if with_stack is False or env_with_stack is False else True
+         req = ProfileReq(
+             type=ProfileReqType.START_PROFILE,
+             output_dir=output_dir,
+             start_step=start_step,
+             num_steps=num_steps,
+             activities=activities,
+             with_stack=with_stack,
+             record_shapes=record_shapes,
+             profile_by_stage=profile_by_stage,
+             profile_id=str(time.time()),
+         )
+         return await self._execute_profile(req)
+
+     async def stop_profile(self: TokenizerManager):
+         self.auto_create_handle_loop()
+         req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
+         return await self._execute_profile(req)
+
+     async def _execute_profile(self: TokenizerManager, req: ProfileReq):
+         result = (await self.profile_communicator(req))[0]
+         if not result.success:
+             raise RuntimeError(result.message)
+         return result
+
+     async def start_expert_distribution_record(self: TokenizerManager):
+         self.auto_create_handle_loop()
+         await self.expert_distribution_communicator(ExpertDistributionReq.START_RECORD)
+
+     async def stop_expert_distribution_record(self: TokenizerManager):
+         self.auto_create_handle_loop()
+         await self.expert_distribution_communicator(ExpertDistributionReq.STOP_RECORD)
+
+     async def dump_expert_distribution_record(self: TokenizerManager):
+         self.auto_create_handle_loop()
+         await self.expert_distribution_communicator(ExpertDistributionReq.DUMP_RECORD)
+
+     async def init_weights_update_group(
+         self: TokenizerManager,
+         obj: InitWeightsUpdateGroupReqInput,
+         request: Optional[fastapi.Request] = None,
+     ) -> Tuple[bool, str]:
+         self.auto_create_handle_loop()
+         assert (
+             self.server_args.dp_size == 1
+         ), "dp_size must be 1 for init parameter update group"
+         result = (await self.init_weights_update_group_communicator(obj))[0]
+         return result.success, result.message
+
+     async def update_weights_from_distributed(
+         self: TokenizerManager,
+         obj: UpdateWeightsFromDistributedReqInput,
+         request: Optional[fastapi.Request] = None,
+     ) -> Tuple[bool, str]:
+         self.auto_create_handle_loop()
+         assert (
+             self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+         ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
+
+         if obj.abort_all_requests:
+             self.abort_request(abort_all=True)
+
+         # This means that weight sync
+         # cannot run while requests are in progress.
+         async with self.model_update_lock.writer_lock:
+             result = (await self.update_weights_from_distributed_communicator(obj))[0]
+             return result.success, result.message
+
+     async def init_weights_send_group_for_remote_instance(
+         self,
+         obj: InitWeightsSendGroupForRemoteInstanceReqInput,
+         request: Optional[fastapi.Request] = None,
+     ) -> Tuple[bool, str]:
+         self.auto_create_handle_loop()
+         # TODO: support DP
+         assert (
+             self.server_args.dp_size == 1
+         ), "dp_size must be 1 for init_weights_send_group_for_remote_instance"
+         result = (
+             await self.init_weights_send_group_for_remote_instance_communicator(obj)
+         )[0]
+         return result.success, result.message
+
+     async def send_weights_to_remote_instance(
+         self,
+         obj: SendWeightsToRemoteInstanceReqInput,
+         request: Optional[fastapi.Request] = None,
+     ) -> Tuple[bool, str]:
+         self.auto_create_handle_loop()
+         # TODO: support DP
+         assert (
+             self.server_args.dp_size == 1
+         ), "dp_size must be 1 for send_weights_to_remote_instance"
+         result = (await self.send_weights_to_remote_instance_communicator(obj))[0]
+         return result.success, result.message
+
+     async def update_weights_from_tensor(
+         self: TokenizerManager,
+         obj: UpdateWeightsFromTensorReqInput,
+         request: Optional[fastapi.Request] = None,
+     ) -> Tuple[bool, str]:
+         self.auto_create_handle_loop()
+         assert (
+             self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+         ), "dp_size must be 1 or dp attention must be enabled for update weights from tensor"
+
+         if obj.abort_all_requests:
+             self.abort_request(abort_all=True)
+
+         # This means that weight sync
+         # cannot run while requests are in progress.
+         async with self.model_update_lock.writer_lock:
+             result = (await self.update_weights_from_tensor_communicator(obj))[0]
+             return result.success, result.message
+
+     async def load_lora_adapter(
+         self: TokenizerManager,
+         obj: LoadLoRAAdapterReqInput,
+         _: Optional[fastapi.Request] = None,
+     ) -> LoadLoRAAdapterReqOutput:
+         self.auto_create_handle_loop()
+
+         try:
+             if not self.server_args.enable_lora:
+                 raise ValueError(
+                     "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                 )
+
+             # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+             # with dp_size > 1.
+             assert (
+                 self.server_args.dp_size == 1
+             ), "dp_size must be 1 for dynamic lora loading"
+             logger.info(
+                 "Start load Lora adapter. Lora name=%s, path=%s",
+                 obj.lora_name,
+                 obj.lora_path,
+             )
+
+             async with self.lora_update_lock:
+                 if (
+                     self.server_args.max_loaded_loras is not None
+                     and self.lora_registry.num_registered_loras
+                     >= self.server_args.max_loaded_loras
+                 ):
+                     raise ValueError(
+                         f"Cannot load LoRA adapter {obj.lora_name} at path {obj.lora_path}. "
+                         f"Maximum number of loaded LoRA adapters is {self.server_args.max_loaded_loras}. "
+                         "Please unload some LoRA adapters before loading new ones."
+                     )
+
+                 # Generate new uniquely identifiable LoRARef object.
+                 new_adapter = LoRARef(
+                     lora_name=obj.lora_name,
+                     lora_path=obj.lora_path,
+                     pinned=obj.pinned,
+                 )
+
+                 # Trigger the actual loading operation at the backend processes.
+                 obj.lora_id = new_adapter.lora_id
+                 result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                 # Register the LoRA adapter only after loading is successful.
+                 if result.success:
+                     await self.lora_registry.register(new_adapter)
+
+                 return result
+         except ValueError as e:
+             return LoadLoRAAdapterReqOutput(
+                 success=False,
+                 error_message=str(e),
+             )
+
+     async def unload_lora_adapter(
+         self: TokenizerManager,
+         obj: UnloadLoRAAdapterReqInput,
+         _: Optional[fastapi.Request] = None,
+     ) -> UnloadLoRAAdapterReqOutput:
+         self.auto_create_handle_loop()
+
+         try:
+             if not self.server_args.enable_lora:
+                 raise ValueError(
+                     "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                 )
+
+             assert (
+                 obj.lora_name is not None
+             ), "lora_name must be provided to unload LoRA adapter"
+
+             # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+             # with dp_size > 1.
+             assert (
+                 self.server_args.dp_size == 1
+             ), "dp_size must be 1 for dynamic lora loading"
+             logger.info(
+                 "Start unload Lora adapter. Lora name=%s",
+                 obj.lora_name,
+             )
+
+             async with self.lora_update_lock:
+                 # Unregister the LoRA adapter from the registry to stop new requests for this adapter
+                 # from being started.
+                 lora_id = await self.lora_registry.unregister(obj.lora_name)
+                 obj.lora_id = lora_id
+
+                 # Initiate the actual unloading operation at the backend processes only after all
+                 # ongoing requests using this LoRA adapter are finished.
+                 await self.lora_registry.wait_for_unload(lora_id)
+                 result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                 return result
+         except ValueError as e:
+             return UnloadLoRAAdapterReqOutput(success=False, error_message=str(e))
+
+     async def get_weights_by_name(
+         self: TokenizerManager,
+         obj: GetWeightsByNameReqInput,
+         request: Optional[fastapi.Request] = None,
+     ):
+         self.auto_create_handle_loop()
+         results = await self.get_weights_by_name_communicator(obj)
+         all_parameters = [r.parameter for r in results]
+         if self.server_args.dp_size == 1:
+             return all_parameters[0]
+         else:
+             return all_parameters
+
+     async def release_memory_occupation(
+         self: TokenizerManager,
+         obj: ReleaseMemoryOccupationReqInput,
+         request: Optional[fastapi.Request] = None,
+     ):
+         self.auto_create_handle_loop()
+         await self.release_memory_occupation_communicator(obj)
+
+     async def resume_memory_occupation(
+         self: TokenizerManager,
+         obj: ResumeMemoryOccupationReqInput,
+         request: Optional[fastapi.Request] = None,
+     ):
+         self.auto_create_handle_loop()
+         await self.resume_memory_occupation_communicator(obj)
+
+     async def slow_down(
+         self: TokenizerManager,
+         obj: SlowDownReqInput,
+         request: Optional[fastapi.Request] = None,
+     ):
+         self.auto_create_handle_loop()
+         await self.slow_down_communicator(obj)
+
+     async def get_internal_state(self: TokenizerManager) -> List[Dict[Any, Any]]:
+         req = GetInternalStateReq()
+         responses: List[GetInternalStateReqOutput] = (
+             await self.get_internal_state_communicator(req)
+         )
+         # Many DP ranks
+         return [res.internal_state for res in responses]
+
+     async def set_internal_state(
+         self: TokenizerManager, obj: SetInternalStateReq
+     ) -> List[bool]:
+         responses: List[SetInternalStateReqOutput] = (
+             await self.set_internal_state_communicator(obj)
+         )
+         return [res.updated for res in responses]
+
+     async def get_load(self: TokenizerManager) -> List[GetLoadReqOutput]:
+         req = GetLoadReqInput()
+         return await self.get_load_communicator(req)
+
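Note on the new module above: `TokenizerCommunicatorMixin` creates one `_Communicator` per control-plane request type. A call sends the request object over the ZMQ socket to the scheduler and then blocks until `handle_recv` has collected one reply per DP rank (`fan_out = server_args.dp_size`), which is why most wrappers return `(await communicator(obj))[0]`. The sketch below is a simplified, self-contained re-implementation of the queueing-mode pattern for illustration only; `FakeSender` and `MiniCommunicator` are invented names, and none of this code is part of the sglang package.

# Standalone sketch of the fan-out request/response pattern used by _Communicator.
# Nothing here imports sglang; FakeSender stands in for the zmq socket, and the
# "scheduler replies" are injected manually via handle_recv().
import asyncio
from collections import deque
from typing import Deque, List, Optional


class FakeSender:
    # Stands in for zmq.Socket: just records what would be sent.
    def __init__(self):
        self.sent = []

    def send_pyobj(self, obj):
        self.sent.append(obj)


class MiniCommunicator:
    # Simplified queueing-mode communicator: at most one in-flight request,
    # resolved once `fan_out` responses have arrived.
    def __init__(self, sender: FakeSender, fan_out: int):
        self._sender = sender
        self._fan_out = fan_out
        self._result_event: Optional[asyncio.Event] = None
        self._result_values: Optional[List] = None
        self._ready_queue: Deque[asyncio.Event] = deque()

    async def __call__(self, obj):
        # Wait our turn if another request is already in flight.
        ready_event = asyncio.Event()
        if self._result_event is not None or self._ready_queue:
            self._ready_queue.append(ready_event)
            await ready_event.wait()
        self._sender.send_pyobj(obj)
        self._result_event = asyncio.Event()
        self._result_values = []
        await self._result_event.wait()
        values = self._result_values
        self._result_event = self._result_values = None
        if self._ready_queue:
            self._ready_queue.popleft().set()
        return values

    def handle_recv(self, recv_obj):
        # Called once per DP rank; resolves the pending call on the last reply.
        self._result_values.append(recv_obj)
        if len(self._result_values) == self._fan_out:
            self._result_event.set()


async def main():
    comm = MiniCommunicator(FakeSender(), fan_out=2)  # e.g. dp_size == 2
    request = asyncio.create_task(comm("flush_cache"))
    await asyncio.sleep(0)          # let the request send and start waiting
    comm.handle_recv({"rank": 0})   # reply from DP rank 0
    comm.handle_recv({"rank": 1})   # reply from DP rank 1 -> resolves the call
    print(await request)            # [{'rank': 0}, {'rank': 1}]


asyncio.run(main())

Running the sketch prints the per-rank result list once both simulated replies have arrived, mirroring how the real mixin's `flush_cache()` or `get_internal_state()` aggregate one response per DP rank before returning.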