sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/compile_deep_gemm.py +8 -1
  3. sglang/global_config.py +5 -1
  4. sglang/srt/configs/model_config.py +35 -0
  5. sglang/srt/conversation.py +9 -117
  6. sglang/srt/disaggregation/base/conn.py +5 -2
  7. sglang/srt/disaggregation/decode.py +6 -1
  8. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
  9. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  10. sglang/srt/disaggregation/prefill.py +3 -0
  11. sglang/srt/distributed/device_communicators/pynccl.py +7 -0
  12. sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
  13. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
  14. sglang/srt/distributed/parallel_state.py +22 -9
  15. sglang/srt/entrypoints/context.py +244 -0
  16. sglang/srt/entrypoints/engine.py +8 -5
  17. sglang/srt/entrypoints/harmony_utils.py +370 -0
  18. sglang/srt/entrypoints/http_server.py +106 -15
  19. sglang/srt/entrypoints/openai/protocol.py +227 -1
  20. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  21. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  22. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  23. sglang/srt/entrypoints/tool.py +87 -0
  24. sglang/srt/eplb/expert_distribution.py +4 -2
  25. sglang/srt/eplb/expert_location.py +5 -1
  26. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  27. sglang/srt/hf_transformers_utils.py +55 -13
  28. sglang/srt/jinja_template_utils.py +8 -1
  29. sglang/srt/layers/attention/aiter_backend.py +5 -8
  30. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  31. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  32. sglang/srt/layers/attention/flashattention_backend.py +7 -11
  33. sglang/srt/layers/attention/triton_backend.py +85 -14
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  35. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  36. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
  38. sglang/srt/layers/attention/vision.py +40 -15
  39. sglang/srt/layers/communicator.py +35 -8
  40. sglang/srt/layers/dp_attention.py +12 -0
  41. sglang/srt/layers/linear.py +9 -8
  42. sglang/srt/layers/logits_processor.py +9 -1
  43. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  44. sglang/srt/layers/moe/ep_moe/layer.py +87 -107
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  47. sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
  48. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
  49. sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
  50. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
  51. sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
  52. sglang/srt/layers/moe/topk.py +12 -3
  53. sglang/srt/layers/moe/utils.py +59 -0
  54. sglang/srt/layers/quantization/__init__.py +22 -0
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
  56. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
  57. sglang/srt/layers/quantization/fp4.py +557 -0
  58. sglang/srt/layers/quantization/fp8.py +8 -7
  59. sglang/srt/layers/quantization/fp8_kernel.py +0 -4
  60. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  61. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  62. sglang/srt/layers/quantization/mxfp4.py +651 -0
  63. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  64. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  65. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  66. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  67. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  68. sglang/srt/layers/quantization/quark/utils.py +107 -0
  69. sglang/srt/layers/quantization/unquant.py +60 -6
  70. sglang/srt/layers/quantization/w4afp8.py +1 -1
  71. sglang/srt/layers/rotary_embedding.py +225 -1
  72. sglang/srt/layers/utils.py +9 -0
  73. sglang/srt/layers/vocab_parallel_embedding.py +15 -4
  74. sglang/srt/lora/lora_manager.py +70 -14
  75. sglang/srt/lora/lora_registry.py +10 -2
  76. sglang/srt/lora/mem_pool.py +43 -5
  77. sglang/srt/managers/cache_controller.py +61 -32
  78. sglang/srt/managers/data_parallel_controller.py +52 -2
  79. sglang/srt/managers/detokenizer_manager.py +1 -1
  80. sglang/srt/managers/io_struct.py +21 -4
  81. sglang/srt/managers/mm_utils.py +5 -11
  82. sglang/srt/managers/schedule_batch.py +30 -8
  83. sglang/srt/managers/schedule_policy.py +3 -1
  84. sglang/srt/managers/scheduler.py +170 -18
  85. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  86. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  87. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  88. sglang/srt/managers/template_manager.py +59 -22
  89. sglang/srt/managers/tokenizer_manager.py +137 -67
  90. sglang/srt/managers/tp_worker.py +3 -0
  91. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  92. sglang/srt/managers/utils.py +45 -1
  93. sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
  94. sglang/srt/mem_cache/hicache_storage.py +13 -21
  95. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  96. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  97. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  98. sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
  99. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  100. sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
  101. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  102. sglang/srt/model_executor/forward_batch_info.py +48 -17
  103. sglang/srt/model_executor/model_runner.py +24 -2
  104. sglang/srt/model_loader/weight_utils.py +10 -0
  105. sglang/srt/models/bailing_moe.py +425 -0
  106. sglang/srt/models/deepseek_v2.py +95 -50
  107. sglang/srt/models/ernie4.py +426 -0
  108. sglang/srt/models/ernie4_eagle.py +203 -0
  109. sglang/srt/models/gemma3n_mm.py +39 -0
  110. sglang/srt/models/glm4_moe.py +102 -27
  111. sglang/srt/models/gpt_oss.py +1134 -0
  112. sglang/srt/models/grok.py +3 -3
  113. sglang/srt/models/llama4.py +13 -2
  114. sglang/srt/models/mixtral.py +3 -3
  115. sglang/srt/models/mllama4.py +428 -19
  116. sglang/srt/models/qwen2.py +6 -0
  117. sglang/srt/models/qwen2_moe.py +7 -4
  118. sglang/srt/models/qwen3_moe.py +39 -14
  119. sglang/srt/models/step3_vl.py +10 -1
  120. sglang/srt/models/transformers.py +2 -5
  121. sglang/srt/multimodal/processors/base_processor.py +4 -3
  122. sglang/srt/multimodal/processors/gemma3n.py +0 -7
  123. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  124. sglang/srt/operations_strategy.py +1 -1
  125. sglang/srt/reasoning_parser.py +18 -39
  126. sglang/srt/server_args.py +218 -23
  127. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
  128. sglang/srt/two_batch_overlap.py +163 -9
  129. sglang/srt/utils.py +41 -26
  130. sglang/srt/weight_sync/utils.py +1 -1
  131. sglang/test/runners.py +4 -4
  132. sglang/test/test_utils.py +4 -4
  133. sglang/version.py +1 -1
  134. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
  135. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
  136. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
  137. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
  138. /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
  139. /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
  140. /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
  141. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  142. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  143. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_moe.py

@@ -148,7 +148,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -211,6 +210,7 @@ class Qwen2MoeAttention(nn.Module):
         max_position_embeddings: int = 8192,
         qkv_bias: int = True,
         quant_config: Optional[QuantizationConfig] = None,
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -268,6 +268,7 @@ class Qwen2MoeAttention(nn.Module):
             max_position=max_position_embeddings,
             base=rope_theta,
             rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
         )
         self.attn = RadixAttention(
             self.num_heads,
@@ -309,6 +310,9 @@ class Qwen2MoeDecoderLayer(nn.Module):
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         qkv_bias = getattr(config, "qkv_bias", True)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
         self.self_attn = Qwen2MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -318,6 +322,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            dual_chunk_attention_config=dual_chunk_attention_config,
             qkv_bias=qkv_bias,
             prefix=add_prefix("self_attn", prefix),
         )
@@ -616,9 +621,7 @@ class Qwen2MoeForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]

-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
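For orientation (not part of the diff): the weight-loading change above always goes through FusedMoE.make_expert_params_mapping. A minimal sketch of that call, with the keyword arguments taken from the hunks and an import path assumed from the file list; the entry layout follows the (param_name, weight_name, expert_id, shard_id) unpacking used by the Qwen3 loader further down.

# Hedged sketch: how the expert params mapping above is produced and consumed.
# Import path assumed from sglang/srt/layers/moe/fused_moe_triton/layer.py.
from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE

expert_params_mapping = FusedMoE.make_expert_params_mapping(
    ckpt_gate_proj_name="gate_proj",
    ckpt_down_proj_name="down_proj",
    ckpt_up_proj_name="up_proj",
    num_experts=128,  # illustrative expert count
)

# Each entry is a (param_name, weight_name, expert_id, shard_id) tuple, which the
# loader uses to rewrite per-expert checkpoint names onto the fused MoE parameters.
for param_name, weight_name, expert_id, shard_id in expert_params_mapping[:3]:
    print(param_name, weight_name, expert_id, shard_id)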
sglang/srt/models/qwen3_moe.py

@@ -24,6 +24,7 @@ import torch
 from torch import nn

 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
 from sglang.srt.models.qwen2_moe import Qwen2MoeModel
 from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
-from sglang.srt.utils import DeepEPMode, add_prefix, is_cuda, is_non_idle_and_non_empty
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty

 Qwen3MoeConfig = None

@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             quant_config=quant_config,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
-                if global_server_args_dict["enable_deepep_moe"]
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             prefix=add_prefix("gate", prefix),
         )

-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size = get_tensor_model_parallel_world_size()
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
             )
@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
     ) -> torch.Tensor:

-        if not global_server_args_dict["enable_deepep_moe"]:
+        if not global_server_args_dict["moe_a2a_backend"].is_deepep():
             return self.forward_normal(hidden_states)
         else:
             return self.forward_deepep(hidden_states, forward_batch)
@@ -296,6 +295,7 @@ class Qwen3MoeAttention(nn.Module):
         attention_bias: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
         alt_stream: Optional[torch.cuda.Stream] = None,
     ) -> None:
         super().__init__()
@@ -354,6 +354,7 @@ class Qwen3MoeAttention(nn.Module):
             max_position=max_position_embeddings,
             base=rope_theta,
             rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
         )
         self.attn = RadixAttention(
             self.num_heads,
@@ -459,6 +460,9 @@ class Qwen3MoeDecoderLayer(nn.Module):
         )
         rms_norm_eps = config.rms_norm_eps
         attention_bias = config.attention_bias
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
         self.self_attn = Qwen3MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -472,6 +476,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
             attention_bias=attention_bias,
             quant_config=quant_config,
             prefix=add_prefix("self_attn", prefix),
+            dual_chunk_attention_config=dual_chunk_attention_config,
             alt_stream=alt_stream,
         )

@@ -767,7 +772,10 @@ class Qwen3MoeForCausalLM(nn.Module):
             num_experts=self.config.num_experts,
         )

-        params_dict = dict(self.named_parameters())
+        # Cache params_dict to avoid repeated expensive traversal of model parameters
+        if not hasattr(self, "_cached_params_dict"):
+            self._cached_params_dict = dict(self.named_parameters())
+        params_dict = self._cached_params_dict
         for name, loaded_weight in weights:
             layer_id = get_layer_id(name)
             if (
@@ -806,11 +814,22 @@ class Qwen3MoeForCausalLM(nn.Module):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
                 for mapping in expert_params_mapping:
                     param_name, weight_name, expert_id, shard_id = mapping
                     if weight_name not in name:
                         continue
+
+                    # Mark as expert weight regardless of whether we can process it
+                    is_expert_weight = True
+
                     name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        # Expert weight not on this rank, will be skipped below
+                        continue
+
                     param = params_dict[name]
                     weight_loader = param.weight_loader
                     weight_loader(
@@ -822,6 +841,10 @@ class Qwen3MoeForCausalLM(nn.Module):
                     )
                     break
                 else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+
                     # Skip loading extra bias for GPTQ models.
                     if name.endswith(".bias") and name not in params_dict:
                         continue
@@ -838,11 +861,13 @@ class Qwen3MoeForCausalLM(nn.Module):
                     logger.warning(f"Parameter {name} not found in params_dict")

         # TODO mimic deepseek
-        self.routed_experts_weights_of_layer = {
-            layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
-            for layer_id in range(self.start_layer, self.end_layer)
-            if isinstance(self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock)
-        }
+        # Lazy initialization of expert weights cache to avoid slowing down load_weights
+        if not hasattr(self, "routed_experts_weights_of_layer"):
+            self.routed_experts_weights_of_layer = {
+                layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+                for layer_id in range(self.start_layer, self.end_layer)
+                if isinstance(self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock)
+            }

     @classmethod
     def get_model_config_for_expert_location(cls, config):
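The load_weights hunks above hinge on Python's for/else: the else branch runs only when no mapping matched (no break), and the new is_expert_weight flag lets that branch skip expert weights that simply live on another rank. A self-contained sketch of the control flow, with illustrative names (not sglang APIs):

# Standalone sketch of the expert-weight skip pattern introduced above.
# Names (weights, params_dict, stacked_params_mapping, expert_params_mapping) are illustrative.
def load(weights, params_dict, stacked_params_mapping, expert_params_mapping):
    for name, loaded_weight in weights:
        for param_name, weight_name, shard_id in stacked_params_mapping:
            if weight_name in name:
                ...  # load stacked weight
                break
        else:
            is_expert_weight = False
            for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
                if weight_name not in name:
                    continue
                is_expert_weight = True  # expert weight, even if we cannot load it here
                mapped = name.replace(weight_name, param_name)
                if mapped not in params_dict:
                    continue  # this expert is not resident on this rank
                ...  # load expert weight
                break
            else:
                if is_expert_weight:
                    continue  # expert weight owned by another rank: skip silently
                ...  # generic parameter handling / bias skipping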
sglang/srt/models/step3_vl.py

@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
             prefix=add_prefix("gate", prefix),
         )

-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -531,11 +531,18 @@ class Step3VisionMLP(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
+        # Since this is a dense model,
+        # the MLP component likewise adopts a DP-MLP approach modeled after DP Attention.
+        # This choice may not represent the optimal solution and remains open to further deliberation.
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
         self.fc1 = ColumnParallelLinear(
             dim,
             intermediate_size,
             bias=bias,
             quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
             prefix=add_prefix("gate_proj", prefix),
         )
         self.act = ACT2FN[hidden_act]  # quick_gelu
@@ -544,6 +551,8 @@ class Step3VisionMLP(nn.Module):
             dim,
             bias=bias,
             quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
             prefix=add_prefix("down_proj", prefix),
         )

sglang/srt/models/transformers.py

@@ -211,16 +211,13 @@ class TransformersForCausalLM(nn.Module):
         Apply the model's tensor parallelization plan.
         Currently only supports linear layers.
         """
-        if not self.model.supports_tp_plan:
-            if tp_size <= 1:
-                return
+        tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}

+        if not tp_plan and self.tp_size > 1:
             raise ValueError(
                 f"{type(self.model)} does not support tensor parallel yet!"
             )

-        tp_plan = self.model._tp_plan
-
         def _tensor_parallel(module: nn.Module, prefix: str = ""):
             for child_name, child_module in module.named_children():
                 qual_name = maybe_prefix(prefix, child_name)
sglang/srt/multimodal/processors/base_processor.py

@@ -12,7 +12,6 @@ import torch
 from PIL import Image
 from transformers import BaseImageProcessorFast

-from sglang.srt.managers.mm_utils import TransportProxyTensor
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import load_audio, load_image, load_video, logger

@@ -218,8 +217,10 @@ class BaseMultimodalProcessor(ABC):
             kwargs["audio"] = audios

         processor = self._processor
-        if hasattr(processor, "image_processor") and isinstance(
-            processor.image_processor, BaseImageProcessorFast
+        if (
+            hasattr(processor, "image_processor")
+            and isinstance(processor.image_processor, BaseImageProcessorFast)
+            and not self.server_args.disable_fast_image_processor
         ):
             kwargs["device"] = "cuda"
             result = processor.__call__(
sglang/srt/multimodal/processors/gemma3n.py

@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================

-import re
 from typing import Dict, List, Optional, Union

 from sglang.srt.managers.multimodal_processor import (
@@ -38,14 +37,8 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<image_soft_token>",
             image_token_id=hf_config.image_token_id,
-            image_token_regex=re.compile(
-                r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
-            ),
             audio_token="<audio_soft_token>",
             audio_token_id=hf_config.audio_token_id,
-            audio_token_regex=re.compile(
-                r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
-            ),
         ).build(_processor)

     async def process_mm_data_async(
sglang/srt/multimodal/processors/step3_vl.py

@@ -8,7 +8,7 @@ import torch
 from PIL import Image
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import BatchFeature, TensorType
+from transformers import BatchFeature, ProcessorMixin, TensorType

 from sglang.srt.models.step3_vl import Step3VLForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
@@ -276,6 +276,8 @@ class Step3VLProcessor:
         super().__init__()

         self.config = config
+        if isinstance(tokenizer, ProcessorMixin):
+            tokenizer = tokenizer.tokenizer
         self.tokenizer = tokenizer

         self.image_size = 728
sglang/srt/operations_strategy.py

@@ -4,7 +4,7 @@ from typing import List, Optional
 import torch

 from sglang.srt import operations
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
+from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.operations import Operation

sglang/srt/reasoning_parser.py

@@ -131,7 +131,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
             If True, streams reasoning content as it arrives.
     """

-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
         # DeepSeek-R1 is assumed to be reasoning until `</think>` token
         super().__init__(
             "<think>",
@@ -144,7 +144,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):

 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
+    Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
         (<think>)*(.*)</think>

@@ -153,47 +153,16 @@ class Qwen3Detector(BaseReasoningFormatDetector):
     - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
     - enable_thinking=False: "The answer is 42." (no thinking tokens)

-    This detector handles both cases.
-
-    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
-    Those models always generate thinking content without <think> start tags.
-    Use "qwen3-thinking" parser type for those models instead.
-
-    Args:
-        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
-            If True, streams reasoning content as it arrives.
-    """
-
-    def __init__(self, stream_reasoning: bool = True):
-        super().__init__(
-            "<think>",
-            "</think>",
-            force_reasoning=False,
-            stream_reasoning=stream_reasoning,
-        )
-
-
-class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
-    """
-    Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
-    Assumes reasoning format:
-        *(.*)</think>
-
-    These models always generate thinking content without <think> start tag.
-    They do not support the enable_thinking parameter and always think.
-
-    Format: "I need to think about this...</think>The answer is 42."
-
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
     """

-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
         super().__init__(
             "<think>",
             "</think>",
-            force_reasoning=True,
+            force_reasoning=force_reasoning,
             stream_reasoning=stream_reasoning,
         )

@@ -207,7 +176,7 @@ class KimiDetector(BaseReasoningFormatDetector):
     and the rest of the text as `normal_text`.
     """

-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
         super().__init__(
             "◁think▷",
             "◁/think▷",
@@ -230,13 +199,18 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
-        "qwen3-thinking": Qwen3ThinkingDetector,
+        "qwen3-thinking": Qwen3Detector,
         "glm45": Qwen3Detector,
         "kimi": KimiDetector,
         "step3": DeepSeekR1Detector,
     }

-    def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
+    def __init__(
+        self,
+        model_type: Optional[str] = None,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = False,
+    ):
         if not model_type:
             raise ValueError("Model type must be specified")

@@ -244,7 +218,12 @@ class ReasoningParser:
         if not detector_class:
             raise ValueError(f"Unsupported model type: {model_type}")

-        self.detector = detector_class(stream_reasoning=stream_reasoning)
+        if model_type.lower() == "qwen3-thinking":
+            force_reasoning = True
+
+        self.detector = detector_class(
+            stream_reasoning=stream_reasoning, force_reasoning=force_reasoning
+        )

     def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
         """Non-streaming call: one-time parsing"""