sglang 0.4.9.post5__py3-none-any.whl → 0.4.9.post6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (32)
  1. sglang/srt/configs/model_config.py +3 -0
  2. sglang/srt/entrypoints/http_server.py +13 -1
  3. sglang/srt/entrypoints/openai/protocol.py +3 -1
  4. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  5. sglang/srt/layers/moe/ep_moe/layer.py +152 -37
  6. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  7. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  8. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  9. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  10. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  11. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  12. sglang/srt/layers/moe/topk.py +6 -2
  13. sglang/srt/layers/quantization/modelopt_quant.py +2 -0
  14. sglang/srt/managers/data_parallel_controller.py +4 -0
  15. sglang/srt/managers/io_struct.py +12 -0
  16. sglang/srt/managers/scheduler.py +29 -0
  17. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  18. sglang/srt/managers/tokenizer_manager.py +43 -9
  19. sglang/srt/managers/tp_worker.py +5 -0
  20. sglang/srt/model_executor/model_runner.py +15 -13
  21. sglang/srt/models/deepseek_v2.py +13 -56
  22. sglang/srt/models/qwen3_moe.py +12 -69
  23. sglang/srt/poll_based_barrier.py +31 -0
  24. sglang/srt/server_args.py +8 -0
  25. sglang/srt/two_batch_overlap.py +8 -3
  26. sglang/test/test_utils.py +53 -0
  27. sglang/version.py +1 -1
  28. {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/METADATA +2 -1
  29. {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/RECORD +32 -25
  30. {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/WHEEL +0 -0
  31. {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/licenses/LICENSE +0 -0
  32. {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py CHANGED
@@ -27,6 +27,7 @@ import threading
  import time
  import uuid
  from collections import deque
+ from contextlib import nullcontext
  from datetime import datetime
  from http import HTTPStatus
  from typing import (
@@ -69,6 +70,7 @@ from sglang.srt.managers.io_struct import (
      BatchMultimodalOut,
      BatchStrOut,
      BatchTokenIDOut,
+     BlockReqType,
      CloseSessionReqInput,
      ConfigureLoggingReq,
      EmbeddingReqInput,
@@ -114,6 +116,7 @@ from sglang.srt.managers.io_struct import (
  )
  from sglang.srt.managers.mm_utils import TensorTransportMode
  from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors
+ from sglang.srt.managers.scheduler_input_blocker import input_blocker_guard_region
  from sglang.srt.metrics.collector import TokenizerMetricsCollector
  from sglang.srt.sampling.sampling_params import SamplingParams
  from sglang.srt.server_args import PortArgs, ServerArgs
@@ -766,6 +769,19 @@ class TokenizerManager:
              ):
                  raise ValueError(finish_reason["message"])

+             if (
+                 finish_reason.get("type") == "abort"
+                 and finish_reason.get("status_code")
+                 == HTTPStatus.SERVICE_UNAVAILABLE
+             ):
+                 # This is an abort request initiated by scheduler.
+                 # Delete the key to prevent resending abort request to the scheduler and
+                 # to ensure aborted request state is cleaned up.
+                 del self.rid_to_state[state.obj.rid]
+                 raise fastapi.HTTPException(
+                     status_code=finish_reason["status_code"],
+                     detail=finish_reason["message"],
+                 )
          yield out
          break

@@ -806,12 +822,21 @@
              rids.append(tmp_obj.rid)
          else:
              # Sequential tokenization and processing
-             for i in range(batch_size):
-                 tmp_obj = obj[i]
-                 tokenized_obj = await self._tokenize_one_request(tmp_obj)
-                 state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                 generators.append(self._wait_one_response(tmp_obj, state, request))
-                 rids.append(tmp_obj.rid)
+             with (
+                 input_blocker_guard_region(send_to_scheduler=self.send_to_scheduler)
+                 if get_bool_env_var("SGLANG_ENABLE_COLOCATED_BATCH_GEN")
+                 else nullcontext()
+             ):
+                 for i in range(batch_size):
+                     tmp_obj = obj[i]
+                     tokenized_obj = await self._tokenize_one_request(tmp_obj)
+                     state = self._send_one_request(
+                         tmp_obj, tokenized_obj, created_time
+                     )
+                     generators.append(
+                         self._wait_one_response(tmp_obj, state, request)
+                     )
+                     rids.append(tmp_obj.rid)
      else:
          # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
          if batch_size > 128:
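The new guard wraps sequential batch tokenization in a context manager only when SGLANG_ENABLE_COLOCATED_BATCH_GEN is set; otherwise nullcontext() keeps the with-statement a no-op. A minimal, self-contained sketch of that conditional-guard pattern (fake_guard is a hypothetical stand-in for input_blocker_guard_region, and the env-var parsing approximates get_bool_env_var):

# Minimal sketch of the conditional context-manager pattern used above.
# `fake_guard` is a hypothetical stand-in for input_blocker_guard_region;
# only the `with (cm if flag else nullcontext()):` shape is the point.
import os
from contextlib import contextmanager, nullcontext

@contextmanager
def fake_guard():
    print("block scheduler input")  # placeholder for the real BlockReqType messaging
    try:
        yield
    finally:
        print("unblock scheduler input")

colocated = os.environ.get("SGLANG_ENABLE_COLOCATED_BATCH_GEN", "false").lower() in ("1", "true")
with fake_guard() if colocated else nullcontext():
    pass  # tokenize and send the whole batch while scheduler input is blocked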
@@ -1705,8 +1730,15 @@
      def _handle_abort_req(self, recv_obj):
          state = self.rid_to_state[recv_obj.rid]
          state.finished = True
-         state.out_list.append(
-             {
+         if recv_obj.finished_reason:
+             out = {
+                 "meta_info": {
+                     "id": recv_obj.rid,
+                     "finish_reason": recv_obj.finished_reason,
+                 },
+             }
+         else:
+             out = {
                  "text": "",
                  "meta_info": {
                      "id": recv_obj.rid,
@@ -1718,7 +1750,7 @@
                      "completion_tokens": 0,
                  },
              }
-         )
+         state.out_list.append(out)
          state.event.set()

      def _handle_open_session_req_output(self, recv_obj):
@@ -1910,8 +1942,10 @@ class _Communicator(Generic[T]):
  #
  # | entrypoint | is_streaming | status        | abort engine    | cancel asyncio task | rid_to_state                |
  # | ---------- | ------------ | ------------- | --------------- | ------------------- | --------------------------- |
+ # | http       | yes          | validation    | background task | fast api            | del in _handle_abort_req    |
  # | http       | yes          | waiting queue | background task | fast api            | del in _handle_abort_req    |
  # | http       | yes          | running       | background task | fast api            | del in _handle_batch_output |
+ # | http       | no           | validation    | http exception  | http exception      | del in _handle_abort_req    |
  # | http       | no           | waiting queue | type 1          | type 1 exception    | del in _handle_abort_req    |
  # | http       | no           | running       | type 3          | type 3 exception    | del in _handle_batch_output |
  #
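Per the updated table, a request aborted by the scheduler during validation or while queued is now surfaced to non-streaming HTTP callers as an exception carrying the abort's status code rather than an empty output. A hedged client-side sketch (URL and payload are placeholders, not from the diff):

# Client-side sketch: a scheduler-initiated abort (e.g. a full waiting queue) now
# comes back as HTTP 503 SERVICE_UNAVAILABLE on /generate for non-streaming calls.
# The URL and prompt below are placeholders.
import requests

resp = requests.post(
    "http://127.0.0.1:30000/generate",
    json={"text": "Hello", "sampling_params": {"max_new_tokens": 8}},
)
if resp.status_code == 503:
    # The request was aborted before it started running; safe to retry later.
    print("server busy:", resp.text)
else:
    print(resp.json())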
sglang/srt/managers/tp_worker.py CHANGED
@@ -130,6 +130,10 @@ class TpModelWorker:
              self.model_runner.req_to_token_pool.size,
          )
          assert self.max_running_requests > 0, "max_running_request is zero"
+         self.max_queued_requests = server_args.max_queued_requests
+         assert (
+             self.max_running_requests > 0
+         ), "max_queued_requests is zero. We need to be at least 1 to schedule a request."
          self.max_req_len = min(
              self.model_config.context_len - 1,
              self.max_total_num_tokens - 1,
@@ -165,6 +169,7 @@
              self.max_total_num_tokens,
              self.max_prefill_tokens,
              self.max_running_requests,
+             self.max_queued_requests,
              self.max_req_len,
              self.max_req_input_len,
              self.random_seed,
sglang/srt/model_executor/model_runner.py CHANGED
@@ -285,11 +285,21 @@ class ModelRunner:
          if architectures and not any("Llama4" in arch for arch in architectures):
              self.is_hybrid = self.model_config.is_hybrid = True

-         self.start_layer = getattr(self.model, "start_layer", 0)
-         self.end_layer = getattr(
-             self.model, "end_layer", self.model_config.num_hidden_layers
+         # For MTP models like DeepSeek-V3 or GLM-4.5, the MTP layer(s) are used separately as draft
+         # models for speculative decoding. In those cases, `num_nextn_predict_layers` is used to
+         # determine the number of layers.
+         model_has_mtp_layers = self.model_config.num_nextn_predict_layers is not None
+         model_num_layers = (
+             self.model_config.num_nextn_predict_layers
+             if self.is_draft_worker and model_has_mtp_layers
+             else self.model_config.num_hidden_layers
          )
+         self.start_layer = getattr(self.model, "start_layer", 0)
+         self.end_layer = getattr(self.model, "end_layer", model_num_layers)
          self.num_effective_layers = self.end_layer - self.start_layer
+         assert (not model_has_mtp_layers) or (
+             self.num_effective_layers == model_num_layers
+         ), "PP is not compatible with MTP models."

          # Apply torchao quantization
          torchao_applied = getattr(self.model, "torchao_applied", False)
@@ -1178,11 +1188,7 @@ class ModelRunner:
              dtype=self.kv_cache_dtype,
              kv_lora_rank=self.model_config.kv_lora_rank,
              qk_rope_head_dim=self.model_config.qk_rope_head_dim,
-             layer_num=(
-                 self.model_config.num_hidden_layers
-                 if not self.is_draft_worker
-                 else self.model_config.hf_config.num_nextn_predict_layers
-             ), # PP is not compatible with mla backend
+             layer_num=self.num_effective_layers,
              device=self.device,
              enable_memory_saver=self.server_args.enable_memory_saver,
              start_layer=self.start_layer,
@@ -1195,11 +1201,7 @@
              dtype=self.kv_cache_dtype,
              kv_lora_rank=self.model_config.kv_lora_rank,
              qk_rope_head_dim=self.model_config.qk_rope_head_dim,
-             layer_num=(
-                 self.model_config.num_hidden_layers
-                 if not self.is_draft_worker
-                 else self.model_config.hf_config.num_nextn_predict_layers
-             ), # PP is not compatible with mla backend
+             layer_num=self.num_effective_layers,
              device=self.device,
              enable_memory_saver=self.server_args.enable_memory_saver,
              start_layer=self.start_layer,
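For context, a small illustrative sketch of the layer-count selection that now feeds layer_num; the DeepSeek-V3-style numbers (61 hidden layers, 1 MTP layer) are assumptions for illustration, not values taken from this diff:

# Illustrative only: how the new model_num_layers selection behaves for an MTP model.
# The config values are assumptions for a DeepSeek-V3-style checkpoint.
num_hidden_layers = 61         # assumed target-model depth
num_nextn_predict_layers = 1   # assumed number of MTP (draft) layers

def effective_layer_num(is_draft_worker: bool) -> int:
    model_has_mtp_layers = num_nextn_predict_layers is not None
    return (
        num_nextn_predict_layers
        if is_draft_worker and model_has_mtp_layers
        else num_hidden_layers
    )

print(effective_layer_num(is_draft_worker=False))  # 61: target worker sizes its KV cache normally
print(effective_layer_num(is_draft_worker=True))   # 1: draft worker allocates KV cache for the MTP layer only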
sglang/srt/models/deepseek_v2.py CHANGED
@@ -594,41 +594,13 @@ class DeepseekV2MoE(nn.Module):
          topk_weights = torch.empty(
              (0, self.top_k), dtype=torch.float32, device=hidden_states.device
          )
-         if self.ep_size > 1:
-             # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
-             (
-                 hidden_states,
-                 topk_idx,
-                 topk_weights,
-                 reorder_topk_ids,
-                 num_recv_tokens_per_expert,
-                 seg_indptr,
-                 masked_m,
-                 expected_m,
-             ) = self.deepep_dispatcher.dispatch(
-                 hidden_states=hidden_states,
-                 topk_idx=topk_idx,
-                 topk_weights=topk_weights,
-                 forward_batch=forward_batch,
-             )
+
          final_hidden_states = self.experts(
              hidden_states=hidden_states,
              topk_idx=topk_idx,
              topk_weights=topk_weights,
-             reorder_topk_ids=reorder_topk_ids,
-             seg_indptr=seg_indptr,
-             masked_m=masked_m,
-             expected_m=expected_m,
-             num_recv_tokens_per_expert=num_recv_tokens_per_expert,
              forward_batch=forward_batch,
          )
-         if self.ep_size > 1:
-             final_hidden_states = self.deepep_dispatcher.combine(
-                 hidden_states=final_hidden_states,
-                 topk_idx=topk_idx,
-                 topk_weights=topk_weights,
-                 forward_batch=forward_batch,
-             )

          if shared_output is not None:
              x = shared_output
@@ -689,8 +661,7 @@ class DeepseekV2MoE(nn.Module):

      def op_dispatch_a(self, state):
          if self.ep_size > 1:
-             # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
-             self.deepep_dispatcher.dispatch_a(
+             self.experts.deepep_dispatcher.dispatch_a(
                  hidden_states=state.hidden_states_mlp_input,
                  topk_idx=state.pop("topk_idx_local"),
                  topk_weights=state.pop("topk_weights_local"),
@@ -703,46 +674,32 @@ class DeepseekV2MoE(nn.Module):
              with get_global_expert_distribution_recorder().with_current_layer(
                  self.layer_id
              ):
-                 (
-                     state.hidden_states_experts_input,
-                     state.topk_idx_dispatched,
-                     state.topk_weights_dispatched,
-                     state.reorder_topk_ids,
-                     state.num_recv_tokens_per_expert,
-                     state.seg_indptr,
-                     state.masked_m,
-                     state.expected_m,
-                 ) = self.deepep_dispatcher.dispatch_b(
+                 state.dispatch_output = self.experts.deepep_dispatcher.dispatch_b(
                      tbo_subbatch_index=state.get("tbo_subbatch_index"),
                  )

      def op_experts(self, state):
-         state.hidden_states_experts_output = self.experts(
-             hidden_states=state.pop("hidden_states_experts_input"),
-             topk_idx=state.topk_idx_dispatched,
-             topk_weights=state.topk_weights_dispatched,
-             reorder_topk_ids=state.pop("reorder_topk_ids"),
-             seg_indptr=state.pop("seg_indptr"),
-             masked_m=state.pop("masked_m"),
-             expected_m=state.pop("expected_m"),
-             num_recv_tokens_per_expert=state.pop("num_recv_tokens_per_expert"),
-             forward_batch=state.forward_batch,
+         state.hidden_states_experts_output = self.experts.moe_impl(
+             dispatch_output=state.dispatch_output,
          )

      def op_combine_a(self, state):
          if self.ep_size > 1:
-             self.deepep_dispatcher.combine_a(
+             self.experts.deepep_dispatcher.combine_a(
                  hidden_states=state.pop("hidden_states_experts_output"),
-                 topk_idx=state.pop("topk_idx_dispatched"),
-                 topk_weights=state.pop("topk_weights_dispatched"),
+                 topk_idx=state.dispatch_output.topk_idx,
+                 topk_weights=state.dispatch_output.topk_weights,
                  forward_batch=state.forward_batch,
                  tbo_subbatch_index=state.get("tbo_subbatch_index"),
              )
+             state.pop("dispatch_output")

      def op_combine_b(self, state):
          if self.ep_size > 1:
-             state.hidden_states_after_combine = self.deepep_dispatcher.combine_b(
-                 tbo_subbatch_index=state.get("tbo_subbatch_index"),
+             state.hidden_states_after_combine = (
+                 self.experts.deepep_dispatcher.combine_b(
+                     tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                 )
              )

      def op_output(self, state):
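Taken together, these changes move DeepEP dispatch/combine behind the experts module: the MoE block now reaches the dispatcher via self.experts.deepep_dispatcher, and the two-batch-overlap ops pass a single DispatchOutput between stages instead of an 8-tuple. A rough sketch of the resulting call sequence (only the method and attribute names come from this diff; the surrounding scaffolding is assumed):

# Rough sketch of the refactored two-batch-overlap MoE pipeline after this diff.
# `experts`, `state`, and `forward_batch` are stand-ins for the real objects; only the
# names deepep_dispatcher, dispatch_a/dispatch_b, moe_impl, combine_a/combine_b and
# DispatchOutput.topk_idx / .topk_weights come from the diff itself.
def run_moe_ops(experts, state, forward_batch):
    # a-phase: launch the asynchronous token dispatch across EP ranks
    experts.deepep_dispatcher.dispatch_a(
        hidden_states=state["hidden_states"],
        topk_idx=state["topk_idx"],
        topk_weights=state["topk_weights"],
        forward_batch=forward_batch,
    )
    # b-phase: everything lands in one DispatchOutput instead of an 8-tuple
    dispatch_output = experts.deepep_dispatcher.dispatch_b()

    # expert computation consumes the bundled dispatch result
    expert_output = experts.moe_impl(dispatch_output=dispatch_output)

    # combine mirrors dispatch: a-phase launches, b-phase waits for the result
    experts.deepep_dispatcher.combine_a(
        hidden_states=expert_output,
        topk_idx=dispatch_output.topk_idx,
        topk_weights=dispatch_output.topk_weights,
        forward_batch=forward_batch,
    )
    return experts.deepep_dispatcher.combine_b()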
sglang/srt/models/qwen3_moe.py CHANGED
@@ -144,19 +144,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
          )
          self.top_k = config.num_experts_per_tok

-         self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
-             group=parallel_state.get_tp_group().device_group,
-             router_topk=self.top_k,
-             permute_fusion=True,
-             num_experts=self.num_experts,
-             num_local_experts=config.num_experts // self.tp_size,
-             hidden_size=config.hidden_size,
-             params_dtype=config.torch_dtype,
-             deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]],
-             async_finish=True, # TODO
-             return_recv_hook=True,
-         )
-
      def forward(
          self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
      ) -> torch.Tensor:
@@ -207,41 +194,12 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
          topk_weights = torch.empty(
              (0, self.top_k), dtype=torch.float32, device=hidden_states.device
          )
-         if self.ep_size > 1:
-             # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
-             (
-                 hidden_states,
-                 topk_idx,
-                 topk_weights,
-                 reorder_topk_ids,
-                 num_recv_tokens_per_expert,
-                 seg_indptr,
-                 masked_m,
-                 expected_m,
-             ) = self.deepep_dispatcher.dispatch(
-                 hidden_states=hidden_states,
-                 topk_idx=topk_idx,
-                 topk_weights=topk_weights,
-                 forward_batch=forward_batch,
-             )
          final_hidden_states = self.experts(
              hidden_states=hidden_states,
              topk_idx=topk_idx,
              topk_weights=topk_weights,
-             reorder_topk_ids=reorder_topk_ids,
-             seg_indptr=seg_indptr,
-             masked_m=masked_m,
-             expected_m=expected_m,
-             num_recv_tokens_per_expert=num_recv_tokens_per_expert,
              forward_batch=forward_batch,
          )
-         if self.ep_size > 1:
-             final_hidden_states = self.deepep_dispatcher.combine(
-                 hidden_states=final_hidden_states,
-                 topk_idx=topk_idx,
-                 topk_weights=topk_weights,
-                 forward_batch=forward_batch,
-             )
          return final_hidden_states

      def op_gate(self, state):
@@ -278,8 +236,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):

      def op_dispatch_a(self, state):
          if self.ep_size > 1:
-             # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
-             self.deepep_dispatcher.dispatch_a(
+             self.experts.deepep_dispatcher.dispatch_a(
                  hidden_states=state.pop("hidden_states_mlp_input"),
                  topk_idx=state.pop("topk_idx_local"),
                  topk_weights=state.pop("topk_weights_local"),
@@ -292,46 +249,32 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
              with get_global_expert_distribution_recorder().with_current_layer(
                  self.layer_id
              ):
-                 (
-                     state.hidden_states_experts_input,
-                     state.topk_idx_dispatched,
-                     state.topk_weights_dispatched,
-                     state.reorder_topk_ids,
-                     state.num_recv_tokens_per_expert,
-                     state.seg_indptr,
-                     state.masked_m,
-                     state.expected_m,
-                 ) = self.deepep_dispatcher.dispatch_b(
+                 state.dispatch_output = self.experts.deepep_dispatcher.dispatch_b(
                      tbo_subbatch_index=state.get("tbo_subbatch_index"),
                  )

      def op_experts(self, state):
-         state.hidden_states_experts_output = self.experts(
-             hidden_states=state.pop("hidden_states_experts_input"),
-             topk_idx=state.topk_idx_dispatched,
-             topk_weights=state.topk_weights_dispatched,
-             reorder_topk_ids=state.pop("reorder_topk_ids"),
-             seg_indptr=state.pop("seg_indptr"),
-             masked_m=state.pop("masked_m"),
-             expected_m=state.pop("expected_m"),
-             num_recv_tokens_per_expert=state.pop("num_recv_tokens_per_expert"),
-             forward_batch=state.forward_batch,
+         state.hidden_states_experts_output = self.experts.moe_impl(
+             dispatch_output=state.dispatch_output,
          )

      def op_combine_a(self, state):
          if self.ep_size > 1:
-             self.deepep_dispatcher.combine_a(
+             self.experts.deepep_dispatcher.combine_a(
                  hidden_states=state.pop("hidden_states_experts_output"),
-                 topk_idx=state.pop("topk_idx_dispatched"),
-                 topk_weights=state.pop("topk_weights_dispatched"),
+                 topk_idx=state.dispatch_output.topk_idx,
+                 topk_weights=state.dispatch_output.topk_weights,
                  forward_batch=state.forward_batch,
                  tbo_subbatch_index=state.get("tbo_subbatch_index"),
              )
+             state.pop("dispatch_output")

      def op_combine_b(self, state):
          if self.ep_size > 1:
-             state.hidden_states_after_combine = self.deepep_dispatcher.combine_b(
-                 tbo_subbatch_index=state.get("tbo_subbatch_index"),
+             state.hidden_states_after_combine = (
+                 self.experts.deepep_dispatcher.combine_b(
+                     tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                 )
              )

      def op_output(self, state):
sglang/srt/poll_based_barrier.py ADDED
@@ -0,0 +1,31 @@
+ import torch
+
+ from sglang.srt.distributed import get_world_group
+
+
+ class PollBasedBarrier:
+     def __init__(self, noop: bool = False):
+         self._noop = noop
+         self._local_arrived = False
+
+     def local_arrive(self):
+         assert not self._local_arrived
+         self._local_arrived = True
+
+     def poll_global_arrived(self) -> bool:
+         global_arrived = self._compute_global_arrived()
+         output = self._local_arrived and global_arrived
+         if output:
+             self._local_arrived = False
+         return output
+
+     def _compute_global_arrived(self) -> bool:
+         local_arrived = self._noop or self._local_arrived
+         global_arrived = torch.tensor(local_arrived)
+         # Can optimize if bottleneck
+         torch.distributed.all_reduce(
+             global_arrived,
+             torch.distributed.ReduceOp.MIN,
+             group=get_world_group().cpu_group,
+         )
+         return global_arrived.item()
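A hypothetical usage sketch of PollBasedBarrier (the loop wiring below is assumed, not taken from this diff): each rank announces its own arrival once and keeps polling, so it can continue other work until every rank has arrived.

# Hypothetical wiring of PollBasedBarrier into a per-rank loop (not from this diff).
# Unlike a blocking barrier, ranks keep polling and can do other work between polls.
from sglang.srt.poll_based_barrier import PollBasedBarrier

barrier = PollBasedBarrier()
arrived = False

def poll_step(ready: bool) -> bool:
    """Call once per loop iteration; returns True when every rank is ready."""
    global arrived
    if ready and not arrived:
        barrier.local_arrive()  # announce this rank exactly once per round
        arrived = True
    if barrier.poll_global_arrived():  # all_reduce(MIN) over the CPU group
        arrived = False  # the barrier resets its local flag; reset ours too
        return True
    return False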
sglang/srt/server_args.py CHANGED
@@ -19,6 +19,7 @@ import json
  import logging
  import os
  import random
+ import sys
  import tempfile
  from typing import List, Literal, Optional, Union

@@ -74,6 +75,7 @@ class ServerArgs:
      # Memory and scheduling
      mem_fraction_static: Optional[float] = None
      max_running_requests: Optional[int] = None
+     max_queued_requests: Optional[int] = sys.maxsize
      max_total_tokens: Optional[int] = None
      chunked_prefill_size: Optional[int] = None
      max_prefill_tokens: int = 16384
@@ -805,6 +807,12 @@
              default=ServerArgs.max_running_requests,
              help="The maximum number of running requests.",
          )
+         parser.add_argument(
+             "--max-queued-requests",
+             type=int,
+             default=ServerArgs.max_queued_requests,
+             help="The maximum number of queued requests. This option is ignored when using disaggregation-mode.",
+         )
          parser.add_argument(
              "--max-total-tokens",
              type=int,
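The new flag rides on the normal launch command (for example, --max-queued-requests 64 passed to python -m sglang.launch_server). A small illustrative sketch of the semantics implied by the sys.maxsize default; the admit helper is hypothetical, not the scheduler's actual admission code:

# Illustrative only: what the sys.maxsize default for max_queued_requests means.
# `admit` is a hypothetical helper, not the scheduler's actual admission code.
import sys

max_queued_requests = sys.maxsize  # default from this diff: the queue is effectively unbounded

def admit(num_queued_requests: int) -> bool:
    # With a finite cap (e.g. --max-queued-requests 64), requests that arrive while the
    # waiting queue is full are aborted and surface to HTTP clients as 503 (see above).
    return num_queued_requests < max_queued_requests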
sglang/srt/two_batch_overlap.py CHANGED
@@ -1,7 +1,9 @@
+ from __future__ import annotations
+
  import dataclasses
  import logging
  from dataclasses import replace
- from typing import Dict, List, Optional, Sequence, Union
+ from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Union

  import torch

@@ -20,6 +22,9 @@ from sglang.srt.operations_strategy import OperationsStrategy
  from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
  from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var

+ if TYPE_CHECKING:
+     from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
+
  _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")

  logger = logging.getLogger(__name__)
@@ -802,7 +807,7 @@ class MaybeTboDeepEPDispatcher:
      def _execute(self, name, tbo_subbatch_index: Optional[int] = None, **kwargs):
          return getattr(self._inners[tbo_subbatch_index or 0], name)(**kwargs)

-     def dispatch(self, **kwargs):
+     def dispatch(self, **kwargs) -> DispatchOutput:
          return self._execute("dispatch", **kwargs)

      def dispatch_a(self, **kwargs):
@@ -811,7 +816,7 @@ class MaybeTboDeepEPDispatcher:
      def dispatch_b(self, **kwargs):
          return self._execute("dispatch_b", **kwargs)

-     def combine(self, **kwargs):
+     def combine(self, **kwargs) -> torch.Tensor:
          return self._execute("combine", **kwargs)

      def combine_a(self, **kwargs):
sglang/test/test_utils.py CHANGED
@@ -19,6 +19,7 @@ from pathlib import Path
  from types import SimpleNamespace
  from typing import Awaitable, Callable, List, Optional, Tuple

+ import aiohttp
  import numpy as np
  import requests
  import torch
@@ -1303,6 +1304,58 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
          raise


+ def send_generate_requests(base_url: str, num_requests: int) -> List[str]:
+     """Sends generate request serially and returns status codes. Max concurrency is 1."""
+
+     def generate():
+         prompt = """
+         System: You are a helpful assistant.
+         User: What is the capital of France?
+         Assistant: The capital of France is
+         """
+         response = requests.post(
+             f"{base_url}/generate",
+             json={
+                 "text": prompt,
+                 "sampling_params": {
+                     "temperature": 0,
+                     "max_new_tokens": 50,
+                 },
+             },
+         )
+         return response.status_code
+
+     return [generate() for _ in range(num_requests)]
+
+
+ async def send_concurrent_generate_requests(
+     base_url: str, num_requests: int
+ ) -> List[str]:
+     """Sends generate request concurrently and returns status codes. Max concurrency is num_requests."""
+
+     async def async_generate():
+         async with aiohttp.ClientSession() as session:
+             prompt = """
+             System: You are a helpful assistant.
+             User: What is the capital of France?
+             Assistant: The capital of France is
+             """
+             async with session.post(
+                 f"{base_url}/generate",
+                 json={
+                     "text": prompt,
+                     "sampling_params": {
+                         "temperature": 0,
+                         "max_new_tokens": 50,
+                     },
+                 },
+             ) as response:
+                 return response.status
+
+     tasks = [asyncio.create_task(async_generate()) for _ in range(num_requests)]
+     return await asyncio.gather(*tasks)
+
+
  class CustomTestCase(unittest.TestCase):
      def _callTestMethod(self, method):
          max_retry = int(
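A hypothetical test built on these helpers (the test class, port, and expected status split are assumptions; the helpers and the 503-on-abort behavior come from this release):

# Hypothetical test sketch built on the helpers above; it assumes a server launched
# with a small --max-queued-requests so overflow requests are aborted with 503.
import unittest
from sglang.test.test_utils import send_concurrent_generate_requests

class TestMaxQueuedRequests(unittest.IsolatedAsyncioTestCase):
    base_url = "http://127.0.0.1:30000"  # assumed server address

    async def test_queue_overflow_returns_503(self):
        statuses = await send_concurrent_generate_requests(self.base_url, num_requests=8)
        # Under the cap some requests finish normally (200); the overflow is rejected (503).
        self.assertTrue(all(code in (200, 503) for code in statuses))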
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.9.post5"
+ __version__ = "0.4.9.post6"
{sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.4.9.post5
+ Version: 0.4.9.post6
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
           Version 2.0, January 2004
@@ -269,6 +269,7 @@ Requires-Dist: torchvision==0.22.1; extra == "blackwell"
  Requires-Dist: cuda-python; extra == "blackwell"
  Requires-Dist: einops; extra == "blackwell"
  Requires-Dist: flashinfer_python==0.2.9rc2; extra == "blackwell"
+ Requires-Dist: tiktoken; extra == "blackwell"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"