sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. sglang/bench_serving.py +56 -12
  2. sglang/launch_server.py +2 -0
  3. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
  4. sglang/srt/compilation/backend.py +1 -1
  5. sglang/srt/configs/model_config.py +5 -5
  6. sglang/srt/distributed/parallel_state.py +0 -7
  7. sglang/srt/entrypoints/engine.py +18 -15
  8. sglang/srt/entrypoints/grpc_server.py +0 -1
  9. sglang/srt/entrypoints/http_server.py +75 -94
  10. sglang/srt/environ.py +16 -2
  11. sglang/srt/eplb/expert_distribution.py +30 -0
  12. sglang/srt/function_call/function_call_parser.py +2 -0
  13. sglang/srt/function_call/minimax_m2.py +367 -0
  14. sglang/srt/layers/activation.py +6 -0
  15. sglang/srt/layers/attention/flashattention_backend.py +12 -2
  16. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  17. sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
  18. sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
  19. sglang/srt/layers/attention/utils.py +78 -0
  20. sglang/srt/layers/communicator.py +1 -0
  21. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  22. sglang/srt/layers/layernorm.py +19 -4
  23. sglang/srt/layers/logits_processor.py +5 -0
  24. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  25. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  26. sglang/srt/layers/moe/ep_moe/layer.py +79 -272
  27. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  28. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  29. sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
  30. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  31. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  32. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  33. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  34. sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
  35. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  36. sglang/srt/layers/moe/topk.py +4 -4
  37. sglang/srt/layers/moe/utils.py +3 -4
  38. sglang/srt/layers/quantization/__init__.py +3 -5
  39. sglang/srt/layers/quantization/awq.py +0 -3
  40. sglang/srt/layers/quantization/base_config.py +7 -0
  41. sglang/srt/layers/quantization/fp8.py +68 -63
  42. sglang/srt/layers/quantization/gguf.py +566 -0
  43. sglang/srt/layers/quantization/mxfp4.py +30 -38
  44. sglang/srt/layers/quantization/unquant.py +23 -45
  45. sglang/srt/layers/quantization/w4afp8.py +38 -2
  46. sglang/srt/layers/radix_attention.py +5 -2
  47. sglang/srt/layers/rotary_embedding.py +13 -1
  48. sglang/srt/layers/sampler.py +12 -1
  49. sglang/srt/managers/io_struct.py +3 -0
  50. sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
  51. sglang/srt/managers/scheduler.py +21 -15
  52. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  53. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  54. sglang/srt/managers/tokenizer_manager.py +11 -19
  55. sglang/srt/mem_cache/hicache_storage.py +7 -1
  56. sglang/srt/mem_cache/memory_pool.py +82 -0
  57. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  58. sglang/srt/model_executor/forward_batch_info.py +44 -3
  59. sglang/srt/model_executor/model_runner.py +1 -149
  60. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  61. sglang/srt/models/deepseek_v2.py +147 -44
  62. sglang/srt/models/glm4_moe.py +322 -354
  63. sglang/srt/models/glm4_moe_nextn.py +4 -14
  64. sglang/srt/models/glm4v_moe.py +29 -196
  65. sglang/srt/models/minimax_m2.py +922 -0
  66. sglang/srt/models/nvila.py +355 -0
  67. sglang/srt/models/nvila_lite.py +184 -0
  68. sglang/srt/models/qwen2.py +22 -1
  69. sglang/srt/models/qwen3.py +34 -4
  70. sglang/srt/models/qwen3_moe.py +2 -4
  71. sglang/srt/multimodal/processors/base_processor.py +1 -0
  72. sglang/srt/multimodal/processors/glm4v.py +1 -1
  73. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  74. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  75. sglang/srt/parser/reasoning_parser.py +28 -1
  76. sglang/srt/server_args.py +365 -186
  77. sglang/srt/single_batch_overlap.py +2 -7
  78. sglang/srt/utils/common.py +87 -42
  79. sglang/srt/utils/hf_transformers_utils.py +7 -3
  80. sglang/test/test_deterministic.py +235 -12
  81. sglang/test/test_deterministic_utils.py +2 -1
  82. sglang/version.py +1 -1
  83. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
  84. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
  85. sglang/srt/models/vila.py +0 -306
  86. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  87. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  88. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/glm4_moe_nextn.py

@@ -12,7 +12,8 @@
 # limitations under the License.
 # ==============================================================================

-"""Inference-only GLM-4.5, GLM-4.6 NextN Speculative Decoding."""
+"""Inference-only GLM-4.5, GLM-4.6 Speculative Decoding."""
+
 import logging
 from typing import Iterable, Optional, Tuple

@@ -33,7 +34,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.models.glm4_moe import Glm4MoeDecoderLayer, Glm4MoeForCausalLM
 from sglang.srt.server_args import get_global_server_args
-from sglang.srt.utils import BumpAllocator, add_prefix
+from sglang.srt.utils import add_prefix

 logger = logging.getLogger(__name__)

@@ -84,14 +85,6 @@ class Glm4MoeModelNextN(nn.Module):
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
-        zero_allocator = BumpAllocator(
-            buffer_size=2,
-            dtype=torch.float32,
-            device=(
-                input_embeds.device if input_embeds is not None else input_ids.device
-            ),
-        )
-
         if input_embeds is None:
             hidden_states = self.embed_tokens(input_ids)
         else:
@@ -111,7 +104,7 @@ class Glm4MoeModelNextN(nn.Module):
         residual = None
         with get_global_expert_distribution_recorder().disable_this_region():
             hidden_states, residual = self.decoder(
-                positions, hidden_states, forward_batch, residual, zero_allocator
+                positions, hidden_states, forward_batch, residual
             )

         if not forward_batch.forward_mode.is_idle():
@@ -124,7 +117,6 @@ class Glm4MoeModelNextN(nn.Module):


 class Glm4MoeForCausalLMNextN(Glm4MoeForCausalLM):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -135,8 +127,6 @@ class Glm4MoeForCausalLMNextN(Glm4MoeForCausalLM):
         self.config = config
         self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
-        self.determine_num_fused_shared_experts("Glm4MoeForCausalLMNextN")
-
         self.model = Glm4MoeModelNextN(
             config, quant_config, prefix=add_prefix("model", prefix)
         )
sglang/srt/models/glm4v_moe.py

@@ -6,13 +6,10 @@ import torch
 import torch.nn as nn
 from transformers.models.glm4v_moe.configuration_glm4v_moe import Glm4vMoeConfig

-from sglang.srt.distributed import (
-    get_moe_expert_parallel_world_size,
-    get_tensor_model_parallel_world_size,
-)
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.attention import vision_utils
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
@@ -20,7 +17,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.glm4_moe import Glm4MoeModel
 from sglang.srt.models.glm4v import Glm4vForConditionalGeneration, Glm4vVisionModel
 from sglang.srt.server_args import get_global_server_args
-from sglang.srt.utils import add_prefix, is_cuda, log_info_on_rank0
+from sglang.srt.utils import add_prefix, is_cuda
 from sglang.srt.utils.hf_transformers_utils import get_processor

 _is_cuda = is_cuda()
@@ -39,12 +36,10 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
     ) -> None:
         nn.Module.__init__(self)

-        config.moe_layer_freq = 1
         self.config = config
         vision_utils.update_vit_attn_dummy_heads_config(self.config)
         self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
-        self.determine_num_fused_shared_experts("Glm4MoeForCausalLM")
         self.num_fused_shared_experts = (
             0
             if get_global_server_args().disable_shared_experts_fusion
@@ -77,38 +72,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
         # For EAGLE3 support
         self.capture_aux_hidden_states = False

-    def determine_num_fused_shared_experts(
-        self, architecture: str = "Glm4MoeForCausalLM"
-    ):
-        self.num_fused_shared_experts = 0
-        if get_global_server_args().disable_shared_experts_fusion:
-            return
-
-        # Only Deepseek V3/R1 can use shared experts fusion optimization now.
-        disable_reason = None
-        if (
-            not _is_cuda
-            or torch.cuda.get_device_capability("cuda") < (8, 0)
-            or self.config.architectures[0] != architecture
-            or self.config.n_shared_experts != 1
-        ):
-            disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
-        elif get_moe_expert_parallel_world_size() > 1:
-            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
-
-        if disable_reason is not None:
-            get_global_server_args().disable_shared_experts_fusion = True
-            self.num_fused_shared_experts = 0
-            log_info_on_rank0(
-                logger,
-                f"{disable_reason} Shared experts fusion optimization is disabled.",
-            )
-            return
-
-        self.num_fused_shared_experts = self.config.n_shared_experts
-
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
-
         if is_nextn:
             if hasattr(self.config, "num_nextn_predict_layers"):
                 num_nextn_layers = self.config.num_nextn_predict_layers
@@ -130,117 +94,14 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
-        if self.num_fused_shared_experts > 0:
-            assert self.num_fused_shared_experts == 1
-            weights_list = list(weights)
-            weights_dict = dict(weights_list)
-            if self.quant_config is not None:
-                if self.quant_config.get_name() == "w8a8_int8":
-                    suffix_list = [
-                        "down_proj.weight",
-                        "down_proj.weight_scale",
-                        "gate_proj.weight",
-                        "gate_proj.weight_scale",
-                        "up_proj.weight",
-                        "up_proj.weight_scale",
-                    ]
-                elif (
-                    self.quant_config.get_name() == "fp8"
-                    or self.quant_config.get_name() == "blockwise_int8"
-                    or self.quant_config.get_name() == "compressed_tensors"
-                ):
-                    suffix_list = [
-                        "down_proj.weight",
-                        "down_proj.weight_scale",
-                        "gate_proj.weight",
-                        "gate_proj.weight_scale",
-                        "up_proj.weight",
-                        "up_proj.weight_scale",
-                    ]
-                elif self.quant_config.get_name() == "awq":
-                    suffix_list = [
-                        "down_proj.qweight",
-                        "down_proj.qzeros",
-                        "down_proj.scales",
-                        "gate_proj.qweight",
-                        "gate_proj.qzeros",
-                        "gate_proj.scales",
-                        "up_proj.qweight",
-                        "up_proj.qzeros",
-                        "up_proj.scales",
-                    ]
-                elif self.quant_config.get_name() == "modelopt_fp4":
-                    suffix_list = [
-                        "down_proj.weight",
-                        "down_proj.weight_scale",
-                        "down_proj.weight_scale_2",
-                        "down_proj.input_scale",
-                        "gate_proj.weight",
-                        "gate_proj.weight_scale",
-                        "gate_proj.weight_scale_2",
-                        "gate_proj.input_scale",
-                        "up_proj.weight",
-                        "up_proj.weight_scale",
-                        "up_proj.weight_scale_2",
-                        "up_proj.input_scale",
-                    ]
-                else:
-                    raise ValueError(
-                        f"Unsupported shared expert fusion for quantization: {self.quant_config.get_name()}."
-                    )
-            else:
-                suffix_list = [
-                    "down_proj.weight",
-                    "gate_proj.weight",
-                    "up_proj.weight",
-                ]
-            names_to_remove = []
-
-            moe_layers = (
-                range(
-                    self.config.first_k_dense_replace,
-                    self.config.num_hidden_layers,
-                    self.config.moe_layer_freq,
-                )
-                if not is_nextn
-                else [nextn_layer_id]
-            )

-            for moe_layer in moe_layers:
-                for suffix in suffix_list:
-                    shared_expert_weight_name = (
-                        f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}"
-                    )
-                    # online fp8 quantization does not load weight_scale
-                    if shared_expert_weight_name not in weights_dict:
-                        continue
-                    weights_list.append(
-                        (
-                            f"model.layers.{moe_layer}."
-                            f"mlp.experts."
-                            f"{self.config.n_routed_experts + 0}"
-                            f".{suffix}",
-                            weights_dict[shared_expert_weight_name],
-                        )
-                    )
-                    names_to_remove += [shared_expert_weight_name]
-            weights = [w for w in weights_list if w[0] not in names_to_remove]
-
-        # Params for weights, fp8 weight scales, fp8 activation scales
-        # (param_name, weight_name, expert_id, shard_id)
         expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts + self.num_fused_shared_experts,
+            num_experts=self.config.n_routed_experts,
         )

-        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
-        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
-            self.config.q_lora_rank is not None
-        )
-        cached_a_proj = {} if fuse_qkv_a_proj else None
-
         if is_nextn:
             nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
             nextn_spec_weight_names = [
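
Aside: the long block deleted in the hunk above is the shared-experts-fusion path, which rewrote `mlp.shared_experts.*` checkpoint tensors so they could be loaded as one extra routed expert. The sketch below condenses that renaming for orientation only; it is not the sglang implementation, and `weights`, `moe_layers`, and `suffixes` are illustrative stand-ins for objects the hunk does not show.

```python
# Sketch of the shared-expert fusion remapping that the deleted block performed.
# `weights`, `moe_layers`, and `suffixes` are illustrative stand-ins.
from typing import Iterable, List, Tuple


def fuse_shared_expert_weights(
    weights: Iterable[Tuple[str, object]],
    n_routed_experts: int,
    moe_layers: Iterable[int],
    suffixes: List[str],
) -> List[Tuple[str, object]]:
    weights_list = list(weights)
    weights_dict = dict(weights_list)
    names_to_remove = []
    for layer in moe_layers:
        for suffix in suffixes:
            src = f"model.layers.{layer}.mlp.shared_experts.{suffix}"
            if src not in weights_dict:
                continue  # e.g. online fp8 quantization ships no weight_scale
            # The shared expert is re-filed as routed expert index n_routed_experts.
            dst = f"model.layers.{layer}.mlp.experts.{n_routed_experts}.{suffix}"
            weights_list.append((dst, weights_dict[src]))
            names_to_remove.append(src)
    return [w for w in weights_list if w[0] not in names_to_remove]
```

With this path removed, the expert mapping is now built with `num_experts=self.config.n_routed_experts` only, as the surviving `+` line in the hunk shows.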
@@ -300,23 +161,36 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
                 # name will be updated to mlp.experts[0].gate_up_proj, which
                 # will then be updated below in expert_params_mapping
                 # for mlp.experts[0].gate_gate_up_proj, which breaks load.
-                if ("mlp.experts." in name) and name not in params_dict:
+                if "mlp.experts" in name:
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                param = params_dict[name]
+                if name not in params_dict:
+                    continue

+                param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
                 for mapping in expert_params_mapping:
                     param_name, weight_name, expert_id, shard_id = mapping
                     if weight_name not in name:
                         continue
+
+                    # Mark as expert weight regardless of whether we can process it
+                    is_expert_weight = True
+
                     name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        # Expert weight not on this rank, will be skipped below
+                        continue
+
                     param = params_dict[name]
                     weight_loader = param.weight_loader
                     weight_loader(
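
The additions in this hunk introduce an `is_expert_weight` flag so that expert tensors which match `expert_params_mapping` but have no parameter on the current expert-parallel rank are skipped instead of falling through to the generic loading path. Below is a minimal, self-contained sketch of that control flow; `params_dict`, `expert_params_mapping`, and the weight names are hypothetical stand-ins, and appending to `loaded` stands in for calling `param.weight_loader(...)`.

```python
# Minimal sketch of the expert-weight skip pattern added in the hunk above.
# All names below are illustrative stand-ins, not the actual sglang objects.
from typing import Dict, Iterable, List, Tuple


def load_weights_sketch(
    weights: Iterable[Tuple[str, object]],
    params_dict: Dict[str, object],
    expert_params_mapping: List[Tuple[str, str, int, str]],
) -> List[str]:
    loaded = []
    for name, loaded_weight in weights:
        is_expert_weight = False
        for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
            if weight_name not in name:
                continue
            # The checkpoint entry is an expert weight even if this rank
            # cannot load it; remember that so the fallback path is skipped.
            is_expert_weight = True
            mapped = name.replace(weight_name, param_name)
            if mapped not in params_dict:
                # Expert lives on another expert-parallel rank: nothing to do here.
                continue
            loaded.append(mapped)  # stands in for param.weight_loader(...)
            break
        else:
            if is_expert_weight:
                # Matched an expert mapping but owns no local parameter: skip.
                continue
            if name in params_dict:
                loaded.append(name)  # generic (non-expert) parameter path
    return loaded


if __name__ == "__main__":
    # Expert 0 is local; expert 7 lives on another rank and is skipped quietly.
    print(
        load_weights_sketch(
            [
                ("layers.0.mlp.experts.0.gate_proj.weight", None),
                ("layers.0.mlp.experts.7.gate_proj.weight", None),
            ],
            {"layers.0.mlp.experts.0.w13_weight": object()},
            [("w13_weight", "gate_proj.weight", 0, "w1")],
        )
    )
```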
@@ -328,64 +202,21 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
                     )
                     break
                 else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+
                     if "visual" in name:
-                        # adapt to VisionAttention
+                        # adapt to VisionAttention for GLM-V
                         name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")

                     # Skip loading extra bias for GPTQ models.
                     if name.endswith(".bias") and name not in params_dict:
                         continue
-                    if fuse_qkv_a_proj and (
-                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
-                    ):
-                        cached_a_proj[name] = loaded_weight
-                        q_a_proj_name = (
-                            name
-                            if "q_a_proj" in name
-                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
-                        )
-                        kv_a_proj_name = (
-                            name
-                            if "kv_a_proj_with_mqa" in name
-                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
-                        )
-
-                        # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
-                        if (
-                            q_a_proj_name in cached_a_proj
-                            and kv_a_proj_name in cached_a_proj
-                        ):
-                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
-                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
-                            fused_weight = torch.cat(
-                                [q_a_proj_weight, kv_a_proj_weight], dim=0
-                            )
-                            param_name = (
-                                name.replace("q_a_proj", "fused_qkv_a_proj_with_mqa")
-                                if "q_a_proj" in name
-                                else name.replace(
-                                    "kv_a_proj_with_mqa", "fused_qkv_a_proj_with_mqa"
-                                )
-                            )
-                            param = params_dict[param_name]
+                    if name not in params_dict:
+                        continue

-                            weight_loader = getattr(
-                                param, "weight_loader", default_weight_loader
-                            )
-                            weight_loader(param, fused_weight)
-                            cached_a_proj.pop(q_a_proj_name)
-                            cached_a_proj.pop(kv_a_proj_name)
-                    else:
-                        if (
-                            "k_scale" in name or "v_scale" in name
-                        ) and name not in params_dict:
-                            # modelopt attn kv scale is named differently
-                            if any(scale in name for scale in ["k_scale", "v_scale"]):
-                                name = name.replace("_proj", "attn_mqa")
-                            else:
-                                logger.warning(
-                                    f"Unknown scale found in checkpoint: {name}"
-                                )
+                    if name in params_dict.keys():
                         param = params_dict[name]
                         weight_loader = getattr(
                             param, "weight_loader", default_weight_loader
@@ -395,6 +226,8 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
                                 self.config, name, loaded_weight
                             )
                         weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")


 EntryClass = [Glm4vMoeForConditionalGeneration]