sglang 0.4.9.post5__py3-none-any.whl → 0.4.9.post6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/configs/model_config.py +3 -0
- sglang/srt/entrypoints/http_server.py +13 -1
- sglang/srt/entrypoints/openai/protocol.py +3 -1
- sglang/srt/entrypoints/openai/serving_base.py +5 -2
- sglang/srt/layers/moe/ep_moe/layer.py +152 -37
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
- sglang/srt/layers/moe/topk.py +6 -2
- sglang/srt/layers/quantization/modelopt_quant.py +2 -0
- sglang/srt/managers/data_parallel_controller.py +4 -0
- sglang/srt/managers/io_struct.py +12 -0
- sglang/srt/managers/scheduler.py +29 -0
- sglang/srt/managers/scheduler_input_blocker.py +106 -0
- sglang/srt/managers/tokenizer_manager.py +43 -9
- sglang/srt/managers/tp_worker.py +5 -0
- sglang/srt/model_executor/model_runner.py +15 -13
- sglang/srt/models/deepseek_v2.py +13 -56
- sglang/srt/models/qwen3_moe.py +12 -69
- sglang/srt/poll_based_barrier.py +31 -0
- sglang/srt/server_args.py +8 -0
- sglang/srt/two_batch_overlap.py +8 -3
- sglang/test/test_utils.py +53 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/METADATA +2 -1
- {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/RECORD +32 -25
- {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -27,6 +27,7 @@ import threading
 import time
 import uuid
 from collections import deque
+from contextlib import nullcontext
 from datetime import datetime
 from http import HTTPStatus
 from typing import (
@@ -69,6 +70,7 @@ from sglang.srt.managers.io_struct import (
     BatchMultimodalOut,
     BatchStrOut,
     BatchTokenIDOut,
+    BlockReqType,
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
@@ -114,6 +116,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.mm_utils import TensorTransportMode
 from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors
+from sglang.srt.managers.scheduler_input_blocker import input_blocker_guard_region
 from sglang.srt.metrics.collector import TokenizerMetricsCollector
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
@@ -766,6 +769,19 @@ class TokenizerManager:
                 ):
                     raise ValueError(finish_reason["message"])

+                if (
+                    finish_reason.get("type") == "abort"
+                    and finish_reason.get("status_code")
+                    == HTTPStatus.SERVICE_UNAVAILABLE
+                ):
+                    # This is an abort request initiated by scheduler.
+                    # Delete the key to prevent resending abort request to the scheduler and
+                    # to ensure aborted request state is cleaned up.
+                    del self.rid_to_state[state.obj.rid]
+                    raise fastapi.HTTPException(
+                        status_code=finish_reason["status_code"],
+                        detail=finish_reason["message"],
+                    )
                 yield out
                 break

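The branch above turns a scheduler-initiated abort carrying `HTTPStatus.SERVICE_UNAVAILABLE` (for example, when the request queue is full) into an HTTP 503 raised from the tokenizer manager. A minimal client-side sketch of how a caller might back off and retry on that status; the payload and retry policy are illustrative assumptions, not part of this diff:

```python
import time

import requests


def generate_with_retry(base_url: str, payload: dict, max_retries: int = 3) -> dict:
    """Retry /generate when the server sheds load with 503 (queue full)."""
    for attempt in range(max_retries):
        resp = requests.post(f"{base_url}/generate", json=payload)
        if resp.status_code != 503:
            resp.raise_for_status()
            return resp.json()
        # The scheduler aborted the request because too many requests were
        # queued; back off briefly before trying again.
        time.sleep(2**attempt)
    raise RuntimeError("server kept returning 503 (queue full)")
```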
@@ -806,12 +822,21 @@ class TokenizerManager:
                     rids.append(tmp_obj.rid)
             else:
                 # Sequential tokenization and processing
-                for i in range(batch_size):
-                    tmp_obj = obj[i]
-                    tokenized_obj = await self._tokenize_one_request(tmp_obj)
-                    state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                    generators.append(self._wait_one_response(tmp_obj, state, request))
-                    rids.append(tmp_obj.rid)
+                with (
+                    input_blocker_guard_region(send_to_scheduler=self.send_to_scheduler)
+                    if get_bool_env_var("SGLANG_ENABLE_COLOCATED_BATCH_GEN")
+                    else nullcontext()
+                ):
+                    for i in range(batch_size):
+                        tmp_obj = obj[i]
+                        tokenized_obj = await self._tokenize_one_request(tmp_obj)
+                        state = self._send_one_request(
+                            tmp_obj, tokenized_obj, created_time
+                        )
+                        generators.append(
+                            self._wait_one_response(tmp_obj, state, request)
+                        )
+                        rids.append(tmp_obj.rid)
         else:
             # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
             if batch_size > 128:
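The sequential path now wraps the tokenize-and-send loop in `input_blocker_guard_region(...)` only when `SGLANG_ENABLE_COLOCATED_BATCH_GEN` is set, and otherwise falls back to `nullcontext()`. A standalone sketch of this conditional context-manager idiom, with a made-up guard so the snippet is self-contained:

```python
from contextlib import contextmanager, nullcontext


@contextmanager
def block_new_inputs():
    # Hypothetical guard: stop accepting new work while the block runs.
    print("inputs blocked")
    try:
        yield
    finally:
        print("inputs unblocked")


def process_batch(items, use_guard: bool) -> None:
    # Pay for the guard only when the feature flag is on.
    with block_new_inputs() if use_guard else nullcontext():
        for item in items:
            print("processing", item)


process_batch(["a", "b"], use_guard=True)
process_batch(["c"], use_guard=False)
```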
@@ -1705,8 +1730,15 @@ class TokenizerManager:
     def _handle_abort_req(self, recv_obj):
         state = self.rid_to_state[recv_obj.rid]
         state.finished = True
-        state.out_list.append(
-            {
+        if recv_obj.finished_reason:
+            out = {
+                "meta_info": {
+                    "id": recv_obj.rid,
+                    "finish_reason": recv_obj.finished_reason,
+                },
+            }
+        else:
+            out = {
                 "text": "",
                 "meta_info": {
                     "id": recv_obj.rid,
@@ -1718,7 +1750,7 @@ class TokenizerManager:
                     "completion_tokens": 0,
                 },
             }
-        )
+        state.out_list.append(out)
         state.event.set()

     def _handle_open_session_req_output(self, recv_obj):
@@ -1910,8 +1942,10 @@ class _Communicator(Generic[T]):
 #
 # | entrypoint | is_streaming | status          | abort engine    | cancel asyncio task | rid_to_state                |
 # | ---------- | ------------ | --------------- | --------------- | ------------------- | --------------------------- |
+# | http       | yes          | validation      | background task | fast api            | del in _handle_abort_req    |
 # | http       | yes          | waiting queue   | background task | fast api            | del in _handle_abort_req    |
 # | http       | yes          | running         | background task | fast api            | del in _handle_batch_output |
+# | http       | no           | validation      | http exception  | http exception      | del in _handle_abort_req    |
 # | http       | no           | waiting queue   | type 1          | type 1 exception    | del in _handle_abort_req    |
 # | http       | no           | running         | type 3          | type 3 exception    | del in _handle_batch_output |
 #
sglang/srt/managers/tp_worker.py
CHANGED
@@ -130,6 +130,10 @@ class TpModelWorker:
             self.model_runner.req_to_token_pool.size,
         )
         assert self.max_running_requests > 0, "max_running_request is zero"
+        self.max_queued_requests = server_args.max_queued_requests
+        assert (
+            self.max_running_requests > 0
+        ), "max_queued_requests is zero. We need to be at least 1 to schedule a request."
         self.max_req_len = min(
             self.model_config.context_len - 1,
             self.max_total_num_tokens - 1,
@@ -165,6 +169,7 @@ class TpModelWorker:
             self.max_total_num_tokens,
             self.max_prefill_tokens,
             self.max_running_requests,
+            self.max_queued_requests,
             self.max_req_len,
             self.max_req_input_len,
             self.random_seed,
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -285,11 +285,21 @@ class ModelRunner:
         if architectures and not any("Llama4" in arch for arch in architectures):
             self.is_hybrid = self.model_config.is_hybrid = True

-
-
-
+        # For MTP models like DeepSeek-V3 or GLM-4.5, the MTP layer(s) are used separately as draft
+        # models for speculative decoding. In those cases, `num_nextn_predict_layers` is used to
+        # determine the number of layers.
+        model_has_mtp_layers = self.model_config.num_nextn_predict_layers is not None
+        model_num_layers = (
+            self.model_config.num_nextn_predict_layers
+            if self.is_draft_worker and model_has_mtp_layers
+            else self.model_config.num_hidden_layers
         )
+        self.start_layer = getattr(self.model, "start_layer", 0)
+        self.end_layer = getattr(self.model, "end_layer", model_num_layers)
         self.num_effective_layers = self.end_layer - self.start_layer
+        assert (not model_has_mtp_layers) or (
+            self.num_effective_layers == model_num_layers
+        ), "PP is not compatible with MTP models."

         # Apply torchao quantization
         torchao_applied = getattr(self.model, "torchao_applied", False)
@@ -1178,11 +1188,7 @@ class ModelRunner:
             dtype=self.kv_cache_dtype,
             kv_lora_rank=self.model_config.kv_lora_rank,
             qk_rope_head_dim=self.model_config.qk_rope_head_dim,
-            layer_num=
-            self.model_config.num_hidden_layers
-            if not self.is_draft_worker
-            else self.model_config.hf_config.num_nextn_predict_layers
-            ),  # PP is not compatible with mla backend
+            layer_num=self.num_effective_layers,
             device=self.device,
             enable_memory_saver=self.server_args.enable_memory_saver,
             start_layer=self.start_layer,
@@ -1195,11 +1201,7 @@ class ModelRunner:
             dtype=self.kv_cache_dtype,
             kv_lora_rank=self.model_config.kv_lora_rank,
             qk_rope_head_dim=self.model_config.qk_rope_head_dim,
-            layer_num=
-            self.model_config.num_hidden_layers
-            if not self.is_draft_worker
-            else self.model_config.hf_config.num_nextn_predict_layers
-            ),  # PP is not compatible with mla backend
+            layer_num=self.num_effective_layers,
             device=self.device,
             enable_memory_saver=self.server_args.enable_memory_saver,
             start_layer=self.start_layer,
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -594,41 +594,13 @@ class DeepseekV2MoE(nn.Module):
             topk_weights = torch.empty(
                 (0, self.top_k), dtype=torch.float32, device=hidden_states.device
             )
-
-        # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
-        (
-            hidden_states,
-            topk_idx,
-            topk_weights,
-            reorder_topk_ids,
-            num_recv_tokens_per_expert,
-            seg_indptr,
-            masked_m,
-            expected_m,
-        ) = self.deepep_dispatcher.dispatch(
-            hidden_states=hidden_states,
-            topk_idx=topk_idx,
-            topk_weights=topk_weights,
-            forward_batch=forward_batch,
-        )
+
         final_hidden_states = self.experts(
             hidden_states=hidden_states,
             topk_idx=topk_idx,
             topk_weights=topk_weights,
-            reorder_topk_ids=reorder_topk_ids,
-            seg_indptr=seg_indptr,
-            masked_m=masked_m,
-            expected_m=expected_m,
-            num_recv_tokens_per_expert=num_recv_tokens_per_expert,
             forward_batch=forward_batch,
         )
-        if self.ep_size > 1:
-            final_hidden_states = self.deepep_dispatcher.combine(
-                hidden_states=final_hidden_states,
-                topk_idx=topk_idx,
-                topk_weights=topk_weights,
-                forward_batch=forward_batch,
-            )

         if shared_output is not None:
             x = shared_output
@@ -689,8 +661,7 @@ class DeepseekV2MoE(nn.Module):

     def op_dispatch_a(self, state):
         if self.ep_size > 1:
-
-            self.deepep_dispatcher.dispatch_a(
+            self.experts.deepep_dispatcher.dispatch_a(
                 hidden_states=state.hidden_states_mlp_input,
                 topk_idx=state.pop("topk_idx_local"),
                 topk_weights=state.pop("topk_weights_local"),
@@ -703,46 +674,32 @@ class DeepseekV2MoE(nn.Module):
             with get_global_expert_distribution_recorder().with_current_layer(
                 self.layer_id
             ):
-                (
-                    state.hidden_states_experts_input,
-                    state.topk_idx_dispatched,
-                    state.topk_weights_dispatched,
-                    state.reorder_topk_ids,
-                    state.num_recv_tokens_per_expert,
-                    state.seg_indptr,
-                    state.masked_m,
-                    state.expected_m,
-                ) = self.deepep_dispatcher.dispatch_b(
+                state.dispatch_output = self.experts.deepep_dispatcher.dispatch_b(
                     tbo_subbatch_index=state.get("tbo_subbatch_index"),
                 )

     def op_experts(self, state):
-        state.hidden_states_experts_output = self.experts(
-
-            topk_idx=state.topk_idx_dispatched,
-            topk_weights=state.topk_weights_dispatched,
-            reorder_topk_ids=state.pop("reorder_topk_ids"),
-            seg_indptr=state.pop("seg_indptr"),
-            masked_m=state.pop("masked_m"),
-            expected_m=state.pop("expected_m"),
-            num_recv_tokens_per_expert=state.pop("num_recv_tokens_per_expert"),
-            forward_batch=state.forward_batch,
+        state.hidden_states_experts_output = self.experts.moe_impl(
+            dispatch_output=state.dispatch_output,
         )

     def op_combine_a(self, state):
         if self.ep_size > 1:
-            self.deepep_dispatcher.combine_a(
+            self.experts.deepep_dispatcher.combine_a(
                 hidden_states=state.pop("hidden_states_experts_output"),
-                topk_idx=state.
-                topk_weights=state.
+                topk_idx=state.dispatch_output.topk_idx,
+                topk_weights=state.dispatch_output.topk_weights,
                 forward_batch=state.forward_batch,
                 tbo_subbatch_index=state.get("tbo_subbatch_index"),
             )
+            state.pop("dispatch_output")

     def op_combine_b(self, state):
         if self.ep_size > 1:
-            state.hidden_states_after_combine =
-
+            state.hidden_states_after_combine = (
+                self.experts.deepep_dispatcher.combine_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
             )

     def op_output(self, state):
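Both MoE models now delegate dispatch state to the dispatcher owned by `self.experts`: `dispatch_b` returns a single `dispatch_output` object instead of a long tuple, `moe_impl` consumes it whole, and `combine_a` reads `topk_idx`/`topk_weights` from it. A rough sketch of what such a container and the expert step might look like; any field beyond `topk_idx` and `topk_weights` is an assumption, not taken from this diff:

```python
from dataclasses import dataclass

import torch


@dataclass
class DispatchOutputSketch:
    """Illustrative stand-in for the object returned by dispatch_b."""

    hidden_states: torch.Tensor
    topk_idx: torch.Tensor
    topk_weights: torch.Tensor
    # Additional routing metadata (segment pointers, masked token counts, ...)
    # would travel inside this object instead of as separate keyword arguments.


def op_experts_sketch(experts, state) -> None:
    # The expert computation consumes the whole container at once, mirroring
    # `self.experts.moe_impl(dispatch_output=state.dispatch_output)` above.
    state.hidden_states_experts_output = experts.moe_impl(
        dispatch_output=state.dispatch_output
    )
```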
sglang/srt/models/qwen3_moe.py
CHANGED
@@ -144,19 +144,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         )
         self.top_k = config.num_experts_per_tok

-        self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
-            group=parallel_state.get_tp_group().device_group,
-            router_topk=self.top_k,
-            permute_fusion=True,
-            num_experts=self.num_experts,
-            num_local_experts=config.num_experts // self.tp_size,
-            hidden_size=config.hidden_size,
-            params_dtype=config.torch_dtype,
-            deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]],
-            async_finish=True,  # TODO
-            return_recv_hook=True,
-        )
-
     def forward(
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
     ) -> torch.Tensor:
@@ -207,41 +194,12 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             topk_weights = torch.empty(
                 (0, self.top_k), dtype=torch.float32, device=hidden_states.device
             )
-        if self.ep_size > 1:
-            # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
-            (
-                hidden_states,
-                topk_idx,
-                topk_weights,
-                reorder_topk_ids,
-                num_recv_tokens_per_expert,
-                seg_indptr,
-                masked_m,
-                expected_m,
-            ) = self.deepep_dispatcher.dispatch(
-                hidden_states=hidden_states,
-                topk_idx=topk_idx,
-                topk_weights=topk_weights,
-                forward_batch=forward_batch,
-            )
         final_hidden_states = self.experts(
             hidden_states=hidden_states,
             topk_idx=topk_idx,
             topk_weights=topk_weights,
-            reorder_topk_ids=reorder_topk_ids,
-            seg_indptr=seg_indptr,
-            masked_m=masked_m,
-            expected_m=expected_m,
-            num_recv_tokens_per_expert=num_recv_tokens_per_expert,
             forward_batch=forward_batch,
         )
-        if self.ep_size > 1:
-            final_hidden_states = self.deepep_dispatcher.combine(
-                hidden_states=final_hidden_states,
-                topk_idx=topk_idx,
-                topk_weights=topk_weights,
-                forward_batch=forward_batch,
-            )
         return final_hidden_states

     def op_gate(self, state):
@@ -278,8 +236,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):

     def op_dispatch_a(self, state):
         if self.ep_size > 1:
-
-            self.deepep_dispatcher.dispatch_a(
+            self.experts.deepep_dispatcher.dispatch_a(
                 hidden_states=state.pop("hidden_states_mlp_input"),
                 topk_idx=state.pop("topk_idx_local"),
                 topk_weights=state.pop("topk_weights_local"),
@@ -292,46 +249,32 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             with get_global_expert_distribution_recorder().with_current_layer(
                 self.layer_id
             ):
-                (
-                    state.hidden_states_experts_input,
-                    state.topk_idx_dispatched,
-                    state.topk_weights_dispatched,
-                    state.reorder_topk_ids,
-                    state.num_recv_tokens_per_expert,
-                    state.seg_indptr,
-                    state.masked_m,
-                    state.expected_m,
-                ) = self.deepep_dispatcher.dispatch_b(
+                state.dispatch_output = self.experts.deepep_dispatcher.dispatch_b(
                     tbo_subbatch_index=state.get("tbo_subbatch_index"),
                 )

     def op_experts(self, state):
-        state.hidden_states_experts_output = self.experts(
-
-            topk_idx=state.topk_idx_dispatched,
-            topk_weights=state.topk_weights_dispatched,
-            reorder_topk_ids=state.pop("reorder_topk_ids"),
-            seg_indptr=state.pop("seg_indptr"),
-            masked_m=state.pop("masked_m"),
-            expected_m=state.pop("expected_m"),
-            num_recv_tokens_per_expert=state.pop("num_recv_tokens_per_expert"),
-            forward_batch=state.forward_batch,
+        state.hidden_states_experts_output = self.experts.moe_impl(
+            dispatch_output=state.dispatch_output,
        )

     def op_combine_a(self, state):
         if self.ep_size > 1:
-            self.deepep_dispatcher.combine_a(
+            self.experts.deepep_dispatcher.combine_a(
                 hidden_states=state.pop("hidden_states_experts_output"),
-                topk_idx=state.
-                topk_weights=state.
+                topk_idx=state.dispatch_output.topk_idx,
+                topk_weights=state.dispatch_output.topk_weights,
                 forward_batch=state.forward_batch,
                 tbo_subbatch_index=state.get("tbo_subbatch_index"),
             )
+            state.pop("dispatch_output")

     def op_combine_b(self, state):
         if self.ep_size > 1:
-            state.hidden_states_after_combine =
-
+            state.hidden_states_after_combine = (
+                self.experts.deepep_dispatcher.combine_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
             )

     def op_output(self, state):
sglang/srt/poll_based_barrier.py
ADDED
@@ -0,0 +1,31 @@
+import torch
+
+from sglang.srt.distributed import get_world_group
+
+
+class PollBasedBarrier:
+    def __init__(self, noop: bool = False):
+        self._noop = noop
+        self._local_arrived = False
+
+    def local_arrive(self):
+        assert not self._local_arrived
+        self._local_arrived = True
+
+    def poll_global_arrived(self) -> bool:
+        global_arrived = self._compute_global_arrived()
+        output = self._local_arrived and global_arrived
+        if output:
+            self._local_arrived = False
+        return output
+
+    def _compute_global_arrived(self) -> bool:
+        local_arrived = self._noop or self._local_arrived
+        global_arrived = torch.tensor(local_arrived)
+        # Can optimize if bottleneck
+        torch.distributed.all_reduce(
+            global_arrived,
+            torch.distributed.ReduceOp.MIN,
+            group=get_world_group().cpu_group,
+        )
+        return global_arrived.item()
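`PollBasedBarrier` is a non-blocking barrier: each rank marks its own arrival once, then keeps polling; `poll_global_arrived()` only returns `True` after the MIN all-reduce over every rank's arrival flag succeeds. A hypothetical per-rank usage sketch; it assumes `torch.distributed` is already initialized across the participating ranks, and `do_other_work` stands in for whatever the rank does while waiting:

```python
import time

from sglang.srt.poll_based_barrier import PollBasedBarrier


def do_other_work() -> None:
    # Stand-in for useful work, e.g. continuing to schedule running requests.
    time.sleep(0.01)


barrier = PollBasedBarrier()

# One rank's flow: arrive exactly once, then keep polling inside its loop.
barrier.local_arrive()
while not barrier.poll_global_arrived():
    do_other_work()
print("all ranks arrived; safe to start the blocked phase")
```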
sglang/srt/server_args.py
CHANGED
@@ -19,6 +19,7 @@ import json
 import logging
 import os
 import random
+import sys
 import tempfile
 from typing import List, Literal, Optional, Union

@@ -74,6 +75,7 @@ class ServerArgs:
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
+    max_queued_requests: Optional[int] = sys.maxsize
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
@@ -805,6 +807,12 @@ class ServerArgs:
             default=ServerArgs.max_running_requests,
             help="The maximum number of running requests.",
         )
+        parser.add_argument(
+            "--max-queued-requests",
+            type=int,
+            default=ServerArgs.max_queued_requests,
+            help="The maximum number of queued requests. This option is ignored when using disaggregation-mode.",
+        )
         parser.add_argument(
             "--max-total-tokens",
             type=int,
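`--max-queued-requests` caps how many requests may wait in the scheduler queue (the default `sys.maxsize` keeps the old, effectively unlimited behavior); once the cap is hit, new requests are aborted and surface as HTTP 503, matching the tokenizer_manager change above. A hypothetical launch invocation exercising the flag; the model path, port, and limits are placeholders:

```python
import subprocess

# Equivalent shell command:
#   python -m sglang.launch_server --model-path <model> --port 30000 \
#       --max-running-requests 64 --max-queued-requests 128
subprocess.run(
    [
        "python", "-m", "sglang.launch_server",
        "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
        "--port", "30000",
        "--max-running-requests", "64",
        "--max-queued-requests", "128",  # new in 0.4.9.post6
    ],
    check=True,
)
```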
sglang/srt/two_batch_overlap.py
CHANGED
@@ -1,7 +1,9 @@
+from __future__ import annotations
+
 import dataclasses
 import logging
 from dataclasses import replace
-from typing import Dict, List, Optional, Sequence, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Union

 import torch

@@ -20,6 +22,9 @@ from sglang.srt.operations_strategy import OperationsStrategy
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
 from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var

+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
+
 _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")

 logger = logging.getLogger(__name__)
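Guarding the `DispatchOutput` import with `TYPE_CHECKING` (together with `from __future__ import annotations`) lets the return annotations below reference the class without importing the token dispatcher at runtime, avoiding a circular import. A generic, self-contained sketch of the idiom using a stdlib type in place of `DispatchOutput`:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers; nothing is imported at runtime.
    from decimal import Decimal


def parse_price(text: str) -> Decimal:
    # With postponed evaluation, the annotation above is just a string at
    # runtime; the real import happens lazily where the value is needed.
    from decimal import Decimal

    return Decimal(text)


print(parse_price("19.99"))
```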
@@ -802,7 +807,7 @@ class MaybeTboDeepEPDispatcher:
     def _execute(self, name, tbo_subbatch_index: Optional[int] = None, **kwargs):
         return getattr(self._inners[tbo_subbatch_index or 0], name)(**kwargs)

-    def dispatch(self, **kwargs):
+    def dispatch(self, **kwargs) -> DispatchOutput:
         return self._execute("dispatch", **kwargs)

     def dispatch_a(self, **kwargs):
@@ -811,7 +816,7 @@ class MaybeTboDeepEPDispatcher:
     def dispatch_b(self, **kwargs):
         return self._execute("dispatch_b", **kwargs)

-    def combine(self, **kwargs):
+    def combine(self, **kwargs) -> torch.Tensor:
         return self._execute("combine", **kwargs)

     def combine_a(self, **kwargs):
sglang/test/test_utils.py
CHANGED
@@ -19,6 +19,7 @@ from pathlib import Path
 from types import SimpleNamespace
 from typing import Awaitable, Callable, List, Optional, Tuple

+import aiohttp
 import numpy as np
 import requests
 import torch
@@ -1303,6 +1304,58 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
         raise


+def send_generate_requests(base_url: str, num_requests: int) -> List[str]:
+    """Sends generate request serially and returns status codes. Max concurrency is 1."""
+
+    def generate():
+        prompt = """
+        System: You are a helpful assistant.
+        User: What is the capital of France?
+        Assistant: The capital of France is
+        """
+        response = requests.post(
+            f"{base_url}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 50,
+                },
+            },
+        )
+        return response.status_code
+
+    return [generate() for _ in range(num_requests)]
+
+
+async def send_concurrent_generate_requests(
+    base_url: str, num_requests: int
+) -> List[str]:
+    """Sends generate request concurrently and returns status codes. Max concurrency is num_requests."""
+
+    async def async_generate():
+        async with aiohttp.ClientSession() as session:
+            prompt = """
+            System: You are a helpful assistant.
+            User: What is the capital of France?
+            Assistant: The capital of France is
+            """
+            async with session.post(
+                f"{base_url}/generate",
+                json={
+                    "text": prompt,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 50,
+                    },
+                },
+            ) as response:
+                return response.status
+
+    tasks = [asyncio.create_task(async_generate()) for _ in range(num_requests)]
+    return await asyncio.gather(*tasks)
+
+
 class CustomTestCase(unittest.TestCase):
     def _callTestMethod(self, method):
         max_retry = int(
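These two helpers let tests probe the new queue limit: the serial variant keeps at most one request in flight, while the concurrent one can overflow the queue and observe 503s. A hypothetical test sketch using them; the base URL and request counts are illustrative and assume a server started with a small `--max-queued-requests`:

```python
import asyncio
import unittest

from sglang.test.test_utils import (
    send_concurrent_generate_requests,
    send_generate_requests,
)


class QueueLimitSmokeTest(unittest.TestCase):
    base_url = "http://127.0.0.1:30000"  # placeholder server address

    def test_serial_requests_all_succeed(self):
        statuses = send_generate_requests(self.base_url, num_requests=4)
        self.assertTrue(all(code == 200 for code in statuses))

    def test_concurrent_requests_may_be_shed(self):
        statuses = asyncio.run(
            send_concurrent_generate_requests(self.base_url, num_requests=32)
        )
        # Each request either completes or is rejected with 503 once the
        # queue limit is exceeded.
        self.assertTrue(all(code in (200, 503) for code in statuses))
```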
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.9.
+__version__ = "0.4.9.post6"
{sglang-0.4.9.post5.dist-info → sglang-0.4.9.post6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.9.
+Version: 0.4.9.post6
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                 Version 2.0, January 2004
@@ -269,6 +269,7 @@ Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Requires-Dist: flashinfer_python==0.2.9rc2; extra == "blackwell"
+Requires-Dist: tiktoken; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"