sglang 0.5.1.post3__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +10 -1
  3. sglang/bench_serving.py +251 -26
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/internvl.py +6 -0
  7. sglang/srt/configs/longcat_flash.py +104 -0
  8. sglang/srt/configs/model_config.py +37 -7
  9. sglang/srt/configs/qwen3_next.py +326 -0
  10. sglang/srt/connector/__init__.py +1 -1
  11. sglang/srt/connector/base_connector.py +1 -2
  12. sglang/srt/connector/redis.py +2 -2
  13. sglang/srt/connector/serde/__init__.py +1 -1
  14. sglang/srt/connector/serde/safe_serde.py +4 -3
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/ascend/conn.py +75 -0
  21. sglang/srt/disaggregation/base/conn.py +1 -1
  22. sglang/srt/disaggregation/common/conn.py +15 -12
  23. sglang/srt/disaggregation/decode.py +6 -4
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -420
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +6 -4
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +94 -58
  31. sglang/srt/entrypoints/engine.py +34 -14
  32. sglang/srt/entrypoints/http_server.py +172 -47
  33. sglang/srt/entrypoints/openai/protocol.py +63 -3
  34. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  35. sglang/srt/entrypoints/openai/serving_chat.py +34 -19
  36. sglang/srt/entrypoints/openai/serving_completions.py +10 -4
  37. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  38. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  39. sglang/srt/eplb/eplb_manager.py +28 -4
  40. sglang/srt/eplb/expert_distribution.py +55 -15
  41. sglang/srt/eplb/expert_location.py +8 -3
  42. sglang/srt/eplb/expert_location_updater.py +1 -1
  43. sglang/srt/function_call/ebnf_composer.py +11 -9
  44. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  45. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  46. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  47. sglang/srt/hf_transformers_utils.py +12 -0
  48. sglang/srt/layers/activation.py +44 -9
  49. sglang/srt/layers/attention/aiter_backend.py +93 -68
  50. sglang/srt/layers/attention/ascend_backend.py +250 -112
  51. sglang/srt/layers/attention/fla/chunk.py +242 -0
  52. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  53. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  54. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  55. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  56. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  57. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  58. sglang/srt/layers/attention/fla/index.py +37 -0
  59. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  60. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  61. sglang/srt/layers/attention/fla/op.py +66 -0
  62. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  63. sglang/srt/layers/attention/fla/utils.py +331 -0
  64. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  65. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  66. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  67. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  68. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  69. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  70. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  71. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  72. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  73. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  74. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  75. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  76. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  77. sglang/srt/layers/communicator.py +45 -7
  78. sglang/srt/layers/layernorm.py +54 -12
  79. sglang/srt/layers/logits_processor.py +10 -3
  80. sglang/srt/layers/moe/__init__.py +2 -1
  81. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  82. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  83. sglang/srt/layers/moe/ep_moe/layer.py +110 -49
  84. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  85. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  94. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  95. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  96. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  97. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  98. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  99. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  100. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  101. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  102. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  103. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  104. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  105. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  106. sglang/srt/layers/moe/topk.py +43 -12
  107. sglang/srt/layers/moe/utils.py +6 -5
  108. sglang/srt/layers/quantization/awq.py +19 -7
  109. sglang/srt/layers/quantization/base_config.py +11 -6
  110. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  111. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  112. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  113. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
  114. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
  115. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  116. sglang/srt/layers/quantization/fp8.py +76 -47
  117. sglang/srt/layers/quantization/fp8_utils.py +43 -29
  118. sglang/srt/layers/quantization/gptq.py +25 -17
  119. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  120. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  121. sglang/srt/layers/quantization/mxfp4.py +77 -45
  122. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  123. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  124. sglang/srt/layers/quantization/quark/utils.py +97 -0
  125. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  126. sglang/srt/layers/quantization/unquant.py +135 -47
  127. sglang/srt/layers/quantization/utils.py +13 -0
  128. sglang/srt/layers/quantization/w4afp8.py +60 -42
  129. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  130. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  131. sglang/srt/layers/rocm_linear_utils.py +44 -0
  132. sglang/srt/layers/rotary_embedding.py +28 -19
  133. sglang/srt/layers/sampler.py +29 -5
  134. sglang/srt/lora/backend/base_backend.py +50 -8
  135. sglang/srt/lora/backend/triton_backend.py +90 -2
  136. sglang/srt/lora/layers.py +32 -0
  137. sglang/srt/lora/lora.py +4 -1
  138. sglang/srt/lora/lora_manager.py +35 -112
  139. sglang/srt/lora/mem_pool.py +24 -10
  140. sglang/srt/lora/utils.py +18 -9
  141. sglang/srt/managers/cache_controller.py +242 -278
  142. sglang/srt/managers/data_parallel_controller.py +30 -15
  143. sglang/srt/managers/detokenizer_manager.py +13 -2
  144. sglang/srt/managers/disagg_service.py +46 -0
  145. sglang/srt/managers/io_struct.py +160 -11
  146. sglang/srt/managers/mm_utils.py +6 -1
  147. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  148. sglang/srt/managers/schedule_batch.py +27 -44
  149. sglang/srt/managers/schedule_policy.py +4 -3
  150. sglang/srt/managers/scheduler.py +90 -115
  151. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  152. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  153. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  154. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  155. sglang/srt/managers/template_manager.py +3 -3
  156. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  157. sglang/srt/managers/tokenizer_manager.py +41 -477
  158. sglang/srt/managers/tp_worker.py +16 -4
  159. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  160. sglang/srt/mem_cache/allocator.py +1 -1
  161. sglang/srt/mem_cache/chunk_cache.py +1 -1
  162. sglang/srt/mem_cache/hicache_storage.py +24 -22
  163. sglang/srt/mem_cache/hiradix_cache.py +184 -101
  164. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  165. sglang/srt/mem_cache/memory_pool.py +324 -41
  166. sglang/srt/mem_cache/memory_pool_host.py +25 -18
  167. sglang/srt/mem_cache/radix_cache.py +5 -6
  168. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  169. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  170. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  171. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  172. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +149 -12
  173. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  174. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  175. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +74 -19
  176. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  177. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  178. sglang/srt/metrics/collector.py +484 -63
  179. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  180. sglang/srt/metrics/utils.py +48 -0
  181. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  182. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  183. sglang/srt/model_executor/forward_batch_info.py +72 -18
  184. sglang/srt/model_executor/model_runner.py +189 -31
  185. sglang/srt/model_loader/__init__.py +9 -3
  186. sglang/srt/model_loader/loader.py +33 -28
  187. sglang/srt/model_loader/utils.py +12 -0
  188. sglang/srt/model_loader/weight_utils.py +2 -1
  189. sglang/srt/models/deepseek_v2.py +311 -50
  190. sglang/srt/models/gemma3n_mm.py +1 -1
  191. sglang/srt/models/glm4_moe.py +10 -1
  192. sglang/srt/models/glm4v.py +4 -2
  193. sglang/srt/models/gpt_oss.py +5 -18
  194. sglang/srt/models/internvl.py +28 -0
  195. sglang/srt/models/llama4.py +9 -0
  196. sglang/srt/models/llama_eagle3.py +17 -0
  197. sglang/srt/models/longcat_flash.py +1026 -0
  198. sglang/srt/models/longcat_flash_nextn.py +699 -0
  199. sglang/srt/models/minicpmv.py +165 -3
  200. sglang/srt/models/mllama4.py +25 -0
  201. sglang/srt/models/opt.py +637 -0
  202. sglang/srt/models/qwen2.py +33 -3
  203. sglang/srt/models/qwen2_5_vl.py +90 -42
  204. sglang/srt/models/qwen2_moe.py +79 -14
  205. sglang/srt/models/qwen3.py +8 -2
  206. sglang/srt/models/qwen3_moe.py +39 -8
  207. sglang/srt/models/qwen3_next.py +1039 -0
  208. sglang/srt/models/qwen3_next_mtp.py +109 -0
  209. sglang/srt/models/torch_native_llama.py +1 -1
  210. sglang/srt/models/transformers.py +1 -1
  211. sglang/srt/multimodal/processors/base_processor.py +4 -2
  212. sglang/srt/multimodal/processors/glm4v.py +9 -9
  213. sglang/srt/multimodal/processors/internvl.py +141 -129
  214. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  215. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  216. sglang/srt/sampling/sampling_batch_info.py +18 -15
  217. sglang/srt/server_args.py +297 -79
  218. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  219. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  220. sglang/srt/speculative/eagle_worker.py +216 -120
  221. sglang/srt/speculative/spec_info.py +5 -0
  222. sglang/srt/speculative/standalone_worker.py +109 -0
  223. sglang/srt/utils.py +37 -2
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  226. sglang/test/few_shot_gsm8k.py +1 -0
  227. sglang/test/runners.py +4 -0
  228. sglang/test/test_cutlass_moe.py +24 -6
  229. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  230. sglang/test/test_disaggregation_utils.py +66 -0
  231. sglang/test/test_utils.py +25 -1
  232. sglang/utils.py +5 -0
  233. sglang/version.py +1 -1
  234. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/METADATA +11 -9
  235. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/RECORD +243 -194
  236. sglang/srt/disaggregation/launch_lb.py +0 -131
  237. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  238. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  239. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  240. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  241. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  242. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  243. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  244. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  245. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_communicator_mixin.py (new file)
@@ -0,0 +1,491 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import time
+from collections import deque
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Deque,
+    Dict,
+    Generic,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+)
+
+import fastapi
+
+from sglang.srt.managers.io_struct import (
+    ClearHiCacheReqInput,
+    ClearHiCacheReqOutput,
+    ExpertDistributionReq,
+    ExpertDistributionReqOutput,
+    FlushCacheReqInput,
+    FlushCacheReqOutput,
+    GetInternalStateReq,
+    GetInternalStateReqOutput,
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
+    LoadLoRAAdapterReqInput,
+    LoadLoRAAdapterReqOutput,
+    LoRAUpdateResult,
+    MultiTokenizerWrapper,
+    ProfileReq,
+    ProfileReqOutput,
+    ProfileReqType,
+    ReleaseMemoryOccupationReqInput,
+    ReleaseMemoryOccupationReqOutput,
+    ResumeMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqOutput,
+    SetInternalStateReq,
+    SetInternalStateReqOutput,
+    SlowDownReqInput,
+    SlowDownReqOutput,
+    UnloadLoRAAdapterReqInput,
+    UnloadLoRAAdapterReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightsFromTensorReqOutput,
+)
+from sglang.srt.server_args import LoRARef, ServerArgs
+from sglang.srt.utils import get_bool_env_var
+from sglang.utils import TypeBasedDispatcher
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+T = TypeVar("T")
+
+logger = logging.getLogger(__name__)
+
+
+class _Communicator(Generic[T]):
+    """Note: The communicator now only run up to 1 in-flight request at any time."""
+
+    enable_multi_tokenizer = False
+
+    def __init__(self, sender, fan_out: int):
+        self._sender = sender
+        self._fan_out = fan_out
+        self._result_event: Optional[asyncio.Event] = None
+        self._result_values: Optional[List[T]] = None
+        self._ready_queue: Deque[asyncio.Future] = deque()
+
+    async def __call__(self, obj):
+        ready_event = asyncio.Event()
+        if self._result_event is not None or len(self._ready_queue) > 0:
+            self._ready_queue.append(ready_event)
+            await ready_event.wait()
+            assert self._result_event is None
+            assert self._result_values is None
+
+        if obj:
+            if _Communicator.enable_multi_tokenizer:
+                obj = MultiTokenizerWrapper(worker_id=os.getpid(), obj=obj)
+            self._sender.send_pyobj(obj)
+
+        self._result_event = asyncio.Event()
+        self._result_values = []
+        await self._result_event.wait()
+        result_values = self._result_values
+        self._result_event = self._result_values = None
+
+        if len(self._ready_queue) > 0:
+            self._ready_queue.popleft().set()
+
+        return result_values
+
+    def handle_recv(self, recv_obj: T):
+        self._result_values.append(recv_obj)
+        if len(self._result_values) == self._fan_out:
+            self._result_event.set()
+
+
+class TokenizerCommunicatorMixin:
+    """Mixin class for TokenizerManager to handle communication with the scheduler."""
+
+    def init_communicators(self: TokenizerManager, server_args: ServerArgs):
+        # Communicators
+        self.init_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_distributed_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_tensor_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_weights_by_name_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.release_memory_occupation_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.resume_memory_occupation_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.slow_down_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.flush_cache_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.clear_hicache_storage_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.profile_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_internal_state_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.set_internal_state_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.expert_distribution_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_lora_adapter_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+
+        self._result_dispatcher += self._get_communicator_dispatcher()
+
+    def _get_communicator_dispatcher(self: TokenizerManager):
+        return TypeBasedDispatcher(
+            [
+                (
+                    InitWeightsUpdateGroupReqOutput,
+                    self.init_weights_update_group_communicator.handle_recv,
+                ),
+                (
+                    UpdateWeightsFromDistributedReqOutput,
+                    self.update_weights_from_distributed_communicator.handle_recv,
+                ),
+                (
+                    UpdateWeightsFromTensorReqOutput,
+                    self.update_weights_from_tensor_communicator.handle_recv,
+                ),
+                (
+                    GetWeightsByNameReqOutput,
+                    self.get_weights_by_name_communicator.handle_recv,
+                ),
+                (
+                    ReleaseMemoryOccupationReqOutput,
+                    self.release_memory_occupation_communicator.handle_recv,
+                ),
+                (
+                    ResumeMemoryOccupationReqOutput,
+                    self.resume_memory_occupation_communicator.handle_recv,
+                ),
+                (
+                    SlowDownReqOutput,
+                    self.slow_down_communicator.handle_recv,
+                ),
+                (
+                    ClearHiCacheReqOutput,
+                    self.clear_hicache_storage_communicator.handle_recv,
+                ),
+                (
+                    FlushCacheReqOutput,
+                    self.flush_cache_communicator.handle_recv,
+                ),
+                (
+                    ProfileReqOutput,
+                    self.profile_communicator.handle_recv,
+                ),
+                (
+                    GetInternalStateReqOutput,
+                    self.get_internal_state_communicator.handle_recv,
+                ),
+                (
+                    SetInternalStateReqOutput,
+                    self.set_internal_state_communicator.handle_recv,
+                ),
+                (
+                    ExpertDistributionReqOutput,
+                    self.expert_distribution_communicator.handle_recv,
+                ),
+                (
+                    LoRAUpdateResult,
+                    self.update_lora_adapter_communicator.handle_recv,
+                ),
+            ]
+        )
+
+    async def flush_cache(self: TokenizerManager) -> FlushCacheReqOutput:
+        return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]
+
+    async def clear_hicache_storage(self: TokenizerManager) -> ClearHiCacheReqOutput:
+        """Clear the hierarchical cache storage."""
+        # Delegate to the scheduler to handle HiCacheStorage clearing
+        return (await self.clear_hicache_storage_communicator(ClearHiCacheReqInput()))[
+            0
+        ]
+
+    async def start_profile(
+        self: TokenizerManager,
+        output_dir: Optional[str] = None,
+        start_step: Optional[int] = None,
+        num_steps: Optional[int] = None,
+        activities: Optional[List[str]] = None,
+        with_stack: Optional[bool] = None,
+        record_shapes: Optional[bool] = None,
+        profile_by_stage: bool = False,
+    ):
+        self.auto_create_handle_loop()
+        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+        with_stack = False if with_stack is False or env_with_stack is False else True
+        req = ProfileReq(
+            type=ProfileReqType.START_PROFILE,
+            output_dir=output_dir,
+            start_step=start_step,
+            num_steps=num_steps,
+            activities=activities,
+            with_stack=with_stack,
+            record_shapes=record_shapes,
+            profile_by_stage=profile_by_stage,
+            profile_id=str(time.time()),
+        )
+        return await self._execute_profile(req)
+
+    async def stop_profile(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
+        return await self._execute_profile(req)
+
+    async def _execute_profile(self: TokenizerManager, req: ProfileReq):
+        result = (await self.profile_communicator(req))[0]
+        if not result.success:
+            raise RuntimeError(result.message)
+        return result
+
+    async def start_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        await self.expert_distribution_communicator(ExpertDistributionReq.START_RECORD)
+
+    async def stop_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        await self.expert_distribution_communicator(ExpertDistributionReq.STOP_RECORD)
+
+    async def dump_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        await self.expert_distribution_communicator(ExpertDistributionReq.DUMP_RECORD)
+
+    async def init_weights_update_group(
+        self: TokenizerManager,
+        obj: InitWeightsUpdateGroupReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be 1 for init parameter update group"
+        result = (await self.init_weights_update_group_communicator(obj))[0]
+        return result.success, result.message
+
+    async def update_weights_from_distributed(
+        self: TokenizerManager,
+        obj: UpdateWeightsFromDistributedReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        # This means that weight sync
+        # cannot run while requests are in progress.
+        async with self.model_update_lock.writer_lock:
+            result = (await self.update_weights_from_distributed_communicator(obj))[0]
+            return result.success, result.message
+
+    async def update_weights_from_tensor(
+        self: TokenizerManager,
+        obj: UpdateWeightsFromTensorReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from tensor"
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        # This means that weight sync
+        # cannot run while requests are in progress.
+        async with self.model_update_lock.writer_lock:
+            result = (await self.update_weights_from_tensor_communicator(obj))[0]
+            return result.success, result.message
+
+    async def load_lora_adapter(
+        self: TokenizerManager,
+        obj: LoadLoRAAdapterReqInput,
+        _: Optional[fastapi.Request] = None,
+    ) -> LoadLoRAAdapterReqOutput:
+        self.auto_create_handle_loop()
+
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
+
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start load Lora adapter. Lora name=%s, path=%s",
+                obj.lora_name,
+                obj.lora_path,
+            )
+
+            async with self.lora_update_lock:
+                if (
+                    self.server_args.max_loaded_loras is not None
+                    and self.lora_registry.num_registered_loras
+                    >= self.server_args.max_loaded_loras
+                ):
+                    raise ValueError(
+                        f"Cannot load LoRA adapter {obj.lora_name} at path {obj.lora_path}. "
+                        f"Maximum number of loaded LoRA adapters is {self.server_args.max_loaded_loras}. "
+                        "Please unload some LoRA adapters before loading new ones."
+                    )
+
+                # Generate new uniquely identifiable LoRARef object.
+                new_adapter = LoRARef(
+                    lora_name=obj.lora_name,
+                    lora_path=obj.lora_path,
+                    pinned=obj.pinned,
+                )
+
+                # Trigger the actual loading operation at the backend processes.
+                obj.lora_id = new_adapter.lora_id
+                result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                # Register the LoRA adapter only after loading is successful.
+                if result.success:
+                    await self.lora_registry.register(new_adapter)
+
+                return result
+        except ValueError as e:
+            return LoadLoRAAdapterReqOutput(
+                success=False,
+                error_message=str(e),
+            )
+
+    async def unload_lora_adapter(
+        self: TokenizerManager,
+        obj: UnloadLoRAAdapterReqInput,
+        _: Optional[fastapi.Request] = None,
+    ) -> UnloadLoRAAdapterReqOutput:
+        self.auto_create_handle_loop()
+
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
+
+            assert (
+                obj.lora_name is not None
+            ), "lora_name must be provided to unload LoRA adapter"
+
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start unload Lora adapter. Lora name=%s",
+                obj.lora_name,
+            )
+
+            async with self.lora_update_lock:
+                # Unregister the LoRA adapter from the registry to stop new requests for this adapter
+                # from being started.
+                lora_id = await self.lora_registry.unregister(obj.lora_name)
+                obj.lora_id = lora_id
+
+                # Initiate the actual unloading operation at the backend processes only after all
+                # ongoing requests using this LoRA adapter are finished.
+                await self.lora_registry.wait_for_unload(lora_id)
+                result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                return result
+        except ValueError as e:
+            return UnloadLoRAAdapterReqOutput(success=False, error_message=str(e))
+
+    async def get_weights_by_name(
+        self: TokenizerManager,
+        obj: GetWeightsByNameReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        results = await self.get_weights_by_name_communicator(obj)
+        all_parameters = [r.parameter for r in results]
+        if self.server_args.dp_size == 1:
+            return all_parameters[0]
+        else:
+            return all_parameters
+
+    async def release_memory_occupation(
+        self: TokenizerManager,
+        obj: ReleaseMemoryOccupationReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.release_memory_occupation_communicator(obj)
+
+    async def resume_memory_occupation(
+        self: TokenizerManager,
+        obj: ResumeMemoryOccupationReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.resume_memory_occupation_communicator(obj)
+
+    async def slow_down(
+        self: TokenizerManager,
+        obj: SlowDownReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.slow_down_communicator(obj)
+
+    async def get_internal_state(self: TokenizerManager) -> List[Dict[Any, Any]]:
+        req = GetInternalStateReq()
+        responses: List[GetInternalStateReqOutput] = (
+            await self.get_internal_state_communicator(req)
+        )
+        # Many DP ranks
+        return [res.internal_state for res in responses]
+
+    async def set_internal_state(
+        self: TokenizerManager, obj: SetInternalStateReq
+    ) -> List[bool]:
+        responses: List[SetInternalStateReqOutput] = (
+            await self.set_internal_state_communicator(obj)
+        )
+        return [res.updated for res in responses]
+
+    async def get_load(self: TokenizerManager) -> dict:
+        # TODO(lsyin): fake load report server
+        if not self.current_load_lock.locked():
+            async with self.current_load_lock:
+                internal_state = await self.get_internal_state()
+                self.current_load = internal_state[0]["load"]
+        return {"load": self.current_load}