sglang 0.4.7.post1__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. sglang/bench_one_batch.py +8 -6
  2. sglang/srt/_custom_ops.py +2 -2
  3. sglang/srt/code_completion_parser.py +2 -44
  4. sglang/srt/configs/model_config.py +1 -0
  5. sglang/srt/constants.py +3 -0
  6. sglang/srt/conversation.py +14 -3
  7. sglang/srt/custom_op.py +11 -1
  8. sglang/srt/disaggregation/base/conn.py +2 -0
  9. sglang/srt/disaggregation/decode.py +22 -28
  10. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  11. sglang/srt/disaggregation/mini_lb.py +34 -4
  12. sglang/srt/disaggregation/mooncake/conn.py +301 -64
  13. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  14. sglang/srt/disaggregation/nixl/conn.py +94 -46
  15. sglang/srt/disaggregation/prefill.py +20 -15
  16. sglang/srt/disaggregation/utils.py +47 -18
  17. sglang/srt/distributed/parallel_state.py +12 -4
  18. sglang/srt/entrypoints/engine.py +27 -31
  19. sglang/srt/entrypoints/http_server.py +149 -79
  20. sglang/srt/entrypoints/http_server_engine.py +0 -3
  21. sglang/srt/entrypoints/openai/__init__.py +0 -0
  22. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +115 -34 (module moved; see the import note after this list)
  23. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  24. sglang/srt/entrypoints/openai/serving_chat.py +897 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +425 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +170 -0
  27. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  28. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  29. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  30. sglang/srt/entrypoints/openai/utils.py +72 -0
  31. sglang/srt/function_call/base_format_detector.py +7 -4
  32. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  33. sglang/srt/function_call/ebnf_composer.py +64 -10
  34. sglang/srt/function_call/function_call_parser.py +6 -6
  35. sglang/srt/function_call/llama32_detector.py +1 -1
  36. sglang/srt/function_call/mistral_detector.py +1 -1
  37. sglang/srt/function_call/pythonic_detector.py +1 -1
  38. sglang/srt/function_call/qwen25_detector.py +1 -1
  39. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  40. sglang/srt/layers/activation.py +28 -3
  41. sglang/srt/layers/attention/aiter_backend.py +5 -2
  42. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  43. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
  44. sglang/srt/layers/attention/flashattention_backend.py +43 -23
  45. sglang/srt/layers/attention/flashinfer_backend.py +9 -6
  46. sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
  47. sglang/srt/layers/attention/flashmla_backend.py +5 -2
  48. sglang/srt/layers/attention/tbo_backend.py +3 -3
  49. sglang/srt/layers/attention/triton_backend.py +19 -11
  50. sglang/srt/layers/communicator.py +5 -5
  51. sglang/srt/layers/dp_attention.py +11 -2
  52. sglang/srt/layers/layernorm.py +44 -2
  53. sglang/srt/layers/linear.py +18 -1
  54. sglang/srt/layers/logits_processor.py +14 -5
  55. sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
  56. sglang/srt/layers/moe/ep_moe/layer.py +286 -13
  57. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
  58. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -2
  61. sglang/srt/layers/moe/fused_moe_triton/layer.py +148 -26
  62. sglang/srt/layers/moe/topk.py +117 -4
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  64. sglang/srt/layers/quantization/fp8.py +25 -17
  65. sglang/srt/layers/quantization/fp8_utils.py +5 -4
  66. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  67. sglang/srt/layers/quantization/utils.py +5 -2
  68. sglang/srt/layers/rotary_embedding.py +144 -12
  69. sglang/srt/layers/sampler.py +1 -1
  70. sglang/srt/layers/vocab_parallel_embedding.py +14 -1
  71. sglang/srt/lora/lora_manager.py +173 -74
  72. sglang/srt/lora/mem_pool.py +49 -45
  73. sglang/srt/lora/utils.py +1 -1
  74. sglang/srt/managers/cache_controller.py +33 -15
  75. sglang/srt/managers/expert_distribution.py +21 -0
  76. sglang/srt/managers/io_struct.py +19 -14
  77. sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
  78. sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
  79. sglang/srt/managers/schedule_batch.py +49 -32
  80. sglang/srt/managers/schedule_policy.py +70 -56
  81. sglang/srt/managers/scheduler.py +189 -68
  82. sglang/srt/managers/template_manager.py +226 -0
  83. sglang/srt/managers/tokenizer_manager.py +11 -8
  84. sglang/srt/managers/tp_worker.py +12 -2
  85. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  86. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  87. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  88. sglang/srt/mem_cache/chunk_cache.py +11 -16
  89. sglang/srt/mem_cache/hiradix_cache.py +34 -23
  90. sglang/srt/mem_cache/memory_pool.py +118 -114
  91. sglang/srt/mem_cache/radix_cache.py +20 -16
  92. sglang/srt/model_executor/cuda_graph_runner.py +77 -46
  93. sglang/srt/model_executor/forward_batch_info.py +18 -5
  94. sglang/srt/model_executor/model_runner.py +27 -8
  95. sglang/srt/model_loader/loader.py +50 -8
  96. sglang/srt/model_loader/weight_utils.py +100 -2
  97. sglang/srt/models/deepseek_nextn.py +35 -30
  98. sglang/srt/models/deepseek_v2.py +255 -30
  99. sglang/srt/models/gemma3n_audio.py +949 -0
  100. sglang/srt/models/gemma3n_causal.py +1009 -0
  101. sglang/srt/models/gemma3n_mm.py +511 -0
  102. sglang/srt/models/glm4.py +312 -0
  103. sglang/srt/models/hunyuan.py +771 -0
  104. sglang/srt/models/mimo_mtp.py +2 -18
  105. sglang/srt/reasoning_parser.py +21 -11
  106. sglang/srt/server_args.py +51 -9
  107. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
  108. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
  109. sglang/srt/speculative/eagle_utils.py +80 -8
  110. sglang/srt/speculative/eagle_worker.py +124 -41
  111. sglang/srt/torch_memory_saver_adapter.py +19 -15
  112. sglang/srt/two_batch_overlap.py +4 -1
  113. sglang/srt/utils.py +248 -11
  114. sglang/test/test_block_fp8_ep.py +1 -0
  115. sglang/test/test_utils.py +1 -0
  116. sglang/version.py +1 -1
  117. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +4 -10
  118. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +121 -105
  119. sglang/srt/entrypoints/verl_engine.py +0 -179
  120. sglang/srt/openai_api/adapter.py +0 -2148
  121. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
  123. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
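Note on the moved modules above (entries 22 and 39, together with the removal of sglang/srt/openai_api/adapter.py): the OpenAI-compatible layer now lives under sglang/srt/entrypoints/openai/. Downstream code that imports from the old package path will need updating. A minimal sketch of a compatibility shim, assuming ChatCompletionRequest is among the symbols you import from the protocol module (it is used here only as an illustrative name):

# Hypothetical shim for the openai_api -> entrypoints/openai move; adjust the
# imported symbols to whatever your code actually uses from protocol.py.
try:
    # sglang >= 0.4.8: OpenAI-compatible code lives under entrypoints/openai
    from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
except ImportError:
    # sglang <= 0.4.7: previous location
    from sglang.srt.openai_api.protocol import ChatCompletionRequest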
sglang/srt/models/glm4.py
@@ -0,0 +1,312 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Modeling from:
+# ./llama.py and
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm4/modular_glm4.py
+"""Inference-only GLM4 model compatible with THUDM weights."""
+
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Glm4Config
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaMLP as Glm4MLP
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class Glm4Attention(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = getattr(config, "rope_theta", 1000000)
+        self.rope_scaling = getattr(config, "rope_scaling", None)
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=config.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=self.rope_scaling,
+            partial_rotary_factor=partial_rotary_factor,
+            is_neox_style=False,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        context_layer = self.attn(
+            q,
+            k,
+            v,
+            forward_batch,
+        )
+        attn_output, _ = self.o_proj(context_layer)
+        return attn_output
+
+
+class Glm4DecoderLayer(nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(
+        self,
+        config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # Self attention.
+        self.self_attn = Glm4Attention(
+            config, layer_id, quant_config, prefix=add_prefix("self_attn", prefix)
+        )
+
+        # MLP
+        self.mlp = Glm4MLP(
+            config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_self_attn_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = self.post_self_attn_layernorm(hidden_states)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+
+        return hidden_states, residual
+
+
+class Glm4Model(nn.Module):
+    def __init__(
+        self,
+        config: Glm4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Glm4DecoderLayer(
+                config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
+            ),
+            prefix="model.layers",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+
+class Glm4ForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: Glm4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config: Glm4Config = config
+        self.quant_config = quant_config
+        self.model = Glm4Model(config, quant_config, add_prefix("model", prefix))
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix="lm_head",
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, weight_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    raise KeyError(f"Parameter '{name}' not found in model.")
+
+
+EntryClass = [Glm4ForCausalLM]
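
With Glm4ForCausalLM registered via EntryClass, GLM-4 checkpoints should load through the usual sglang entry points. Below is a minimal smoke-test sketch using the offline Engine API; the model path "THUDM/GLM-4-9B-0414" and the sampling values are assumptions for illustration, not something this diff specifies.

# Sketch: exercising the newly added GLM-4 support via sglang's offline Engine.
# Replace the model path with the GLM-4 checkpoint you actually want to serve.
import sglang as sgl

if __name__ == "__main__":
    llm = sgl.Engine(model_path="THUDM/GLM-4-9B-0414")  # assumed checkpoint name
    prompts = ["Give a one-sentence summary of the GLM-4 architecture."]
    sampling_params = {"temperature": 0.7, "max_new_tokens": 64}
    outputs = llm.generate(prompts, sampling_params)
    for prompt, output in zip(prompts, outputs):
        print(prompt, "->", output["text"])
    llm.shutdown()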