sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl
This diff shows the changes between publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sglang/bench_one_batch.py +149 -34
- sglang/bench_serving.py +73 -14
- sglang/compile_deep_gemm.py +13 -7
- sglang/launch_server.py +2 -0
- sglang/srt/batch_invariant_ops/__init__.py +2 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
- sglang/srt/checkpoint_engine/__init__.py +9 -0
- sglang/srt/checkpoint_engine/update.py +317 -0
- sglang/srt/compilation/backend.py +1 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/deepseek_ocr.py +542 -10
- sglang/srt/configs/deepseekvl2.py +95 -194
- sglang/srt/configs/kimi_linear.py +160 -0
- sglang/srt/configs/mamba_utils.py +66 -0
- sglang/srt/configs/model_config.py +30 -7
- sglang/srt/constants.py +7 -0
- sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
- sglang/srt/disaggregation/decode.py +34 -6
- sglang/srt/disaggregation/nixl/conn.py +2 -2
- sglang/srt/disaggregation/prefill.py +25 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
- sglang/srt/distributed/parallel_state.py +9 -12
- sglang/srt/entrypoints/engine.py +31 -20
- sglang/srt/entrypoints/grpc_server.py +0 -1
- sglang/srt/entrypoints/http_server.py +94 -94
- sglang/srt/entrypoints/openai/protocol.py +7 -1
- sglang/srt/entrypoints/openai/serving_chat.py +42 -0
- sglang/srt/entrypoints/openai/serving_completions.py +10 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/environ.py +23 -2
- sglang/srt/eplb/expert_distribution.py +64 -1
- sglang/srt/eplb/expert_location.py +106 -36
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/minimax_m2.py +367 -0
- sglang/srt/grpc/compile_proto.py +3 -0
- sglang/srt/layers/activation.py +6 -0
- sglang/srt/layers/attention/ascend_backend.py +233 -5
- sglang/srt/layers/attention/attention_registry.py +3 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
- sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
- sglang/srt/layers/attention/fla/kda.py +1359 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
- sglang/srt/layers/attention/flashattention_backend.py +19 -8
- sglang/srt/layers/attention/flashinfer_backend.py +10 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
- sglang/srt/layers/attention/mamba/mamba.py +20 -11
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
- sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
- sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
- sglang/srt/layers/attention/nsa/transform_index.py +1 -1
- sglang/srt/layers/attention/nsa_backend.py +157 -23
- sglang/srt/layers/attention/triton_backend.py +4 -1
- sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
- sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
- sglang/srt/layers/attention/utils.py +78 -0
- sglang/srt/layers/communicator.py +24 -1
- sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/layernorm.py +35 -6
- sglang/srt/layers/logits_processor.py +9 -20
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
- sglang/srt/layers/moe/ep_moe/layer.py +78 -289
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
- sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
- sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
- sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +35 -10
- sglang/srt/layers/moe/utils.py +3 -4
- sglang/srt/layers/pooler.py +21 -2
- sglang/srt/layers/quantization/__init__.py +13 -84
- sglang/srt/layers/quantization/auto_round.py +394 -0
- sglang/srt/layers/quantization/awq.py +0 -3
- sglang/srt/layers/quantization/base_config.py +7 -0
- sglang/srt/layers/quantization/fp8.py +68 -63
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gguf.py +566 -0
- sglang/srt/layers/quantization/modelopt_quant.py +168 -11
- sglang/srt/layers/quantization/mxfp4.py +30 -38
- sglang/srt/layers/quantization/unquant.py +23 -45
- sglang/srt/layers/quantization/w4afp8.py +38 -2
- sglang/srt/layers/radix_attention.py +5 -2
- sglang/srt/layers/rotary_embedding.py +130 -46
- sglang/srt/layers/sampler.py +12 -1
- sglang/srt/lora/lora_registry.py +9 -0
- sglang/srt/managers/async_mm_data_processor.py +122 -0
- sglang/srt/managers/data_parallel_controller.py +30 -3
- sglang/srt/managers/detokenizer_manager.py +3 -0
- sglang/srt/managers/io_struct.py +29 -4
- sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
- sglang/srt/managers/schedule_batch.py +74 -15
- sglang/srt/managers/scheduler.py +185 -144
- sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
- sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
- sglang/srt/managers/scheduler_pp_mixin.py +7 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
- sglang/srt/managers/session_controller.py +6 -5
- sglang/srt/managers/tokenizer_manager.py +165 -78
- sglang/srt/managers/tp_worker.py +24 -1
- sglang/srt/mem_cache/base_prefix_cache.py +23 -4
- sglang/srt/mem_cache/common.py +1 -0
- sglang/srt/mem_cache/hicache_storage.py +7 -1
- sglang/srt/mem_cache/memory_pool.py +253 -57
- sglang/srt/mem_cache/memory_pool_host.py +12 -5
- sglang/srt/mem_cache/radix_cache.py +4 -0
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
- sglang/srt/metrics/collector.py +46 -3
- sglang/srt/model_executor/cuda_graph_runner.py +15 -3
- sglang/srt/model_executor/forward_batch_info.py +55 -14
- sglang/srt/model_executor/model_runner.py +77 -170
- sglang/srt/model_executor/npu_graph_runner.py +7 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/bailing_moe.py +9 -2
- sglang/srt/models/deepseek_nextn.py +11 -2
- sglang/srt/models/deepseek_v2.py +296 -78
- sglang/srt/models/glm4.py +391 -77
- sglang/srt/models/glm4_moe.py +322 -354
- sglang/srt/models/glm4_moe_nextn.py +4 -14
- sglang/srt/models/glm4v.py +196 -55
- sglang/srt/models/glm4v_moe.py +29 -197
- sglang/srt/models/gpt_oss.py +1 -10
- sglang/srt/models/kimi_linear.py +678 -0
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/llama_eagle3.py +11 -1
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/minimax_m2.py +922 -0
- sglang/srt/models/nvila.py +355 -0
- sglang/srt/models/nvila_lite.py +184 -0
- sglang/srt/models/qwen2.py +23 -2
- sglang/srt/models/qwen2_moe.py +30 -15
- sglang/srt/models/qwen3.py +35 -5
- sglang/srt/models/qwen3_moe.py +18 -12
- sglang/srt/models/qwen3_next.py +7 -0
- sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
- sglang/srt/multimodal/processors/base_processor.py +1 -0
- sglang/srt/multimodal/processors/glm4v.py +1 -1
- sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
- sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
- sglang/srt/multiplex/multiplexing_mixin.py +209 -0
- sglang/srt/multiplex/pdmux_context.py +164 -0
- sglang/srt/parser/conversation.py +7 -1
- sglang/srt/parser/reasoning_parser.py +28 -1
- sglang/srt/sampling/custom_logit_processor.py +67 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
- sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
- sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
- sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
- sglang/srt/server_args.py +459 -199
- sglang/srt/single_batch_overlap.py +2 -4
- sglang/srt/speculative/draft_utils.py +16 -0
- sglang/srt/speculative/eagle_info.py +42 -36
- sglang/srt/speculative/eagle_info_v2.py +68 -25
- sglang/srt/speculative/eagle_utils.py +261 -16
- sglang/srt/speculative/eagle_worker.py +11 -3
- sglang/srt/speculative/eagle_worker_v2.py +15 -9
- sglang/srt/speculative/spec_info.py +305 -31
- sglang/srt/speculative/spec_utils.py +44 -8
- sglang/srt/tracing/trace.py +121 -12
- sglang/srt/utils/common.py +142 -74
- sglang/srt/utils/hf_transformers_utils.py +38 -12
- sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
- sglang/test/kits/radix_cache_server_kit.py +50 -0
- sglang/test/runners.py +31 -7
- sglang/test/simple_eval_common.py +5 -3
- sglang/test/simple_eval_humaneval.py +1 -0
- sglang/test/simple_eval_math.py +1 -0
- sglang/test/simple_eval_mmlu.py +1 -0
- sglang/test/simple_eval_mmmu_vlm.py +1 -0
- sglang/test/test_deterministic.py +235 -12
- sglang/test/test_deterministic_utils.py +2 -1
- sglang/test/test_utils.py +7 -1
- sglang/version.py +1 -1
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
- sglang/srt/models/vila.py +0 -306
- /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
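The index above was produced by an automated wheel-to-wheel comparison; the reconstructed hunks that follow cover the scheduler-manager and session-controller files from that list. For readers who want to reproduce a comparable file-level summary locally, here is a minimal sketch using only the Python standard library (the wheel paths are placeholders for locally downloaded artifacts, and CRC comparison is a cheap stand-in for a real content diff):

```python
import zipfile


def wheel_diff_summary(old_wheel: str, new_wheel: str) -> None:
    """Print a file-level summary (added/removed/changed) of two wheels."""
    with zipfile.ZipFile(old_wheel) as old, zipfile.ZipFile(new_wheel) as new:
        old_files = {i.filename: i.CRC for i in old.infolist()}
        new_files = {i.filename: i.CRC for i in new.infolist()}
    for name in sorted(new_files.keys() - old_files.keys()):
        print(f"added    {name}")
    for name in sorted(old_files.keys() - new_files.keys()):
        print(f"removed  {name}")
    for name in sorted(old_files.keys() & new_files.keys()):
        if old_files[name] != new_files[name]:  # CRC mismatch -> content changed
            print(f"changed  {name}")


# Example (paths are placeholders for locally downloaded wheels):
# wheel_diff_summary("sglang-0.5.4-py3-none-any.whl",
#                    "sglang-0.5.4.post2-py3-none-any.whl")
```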
```diff
--- a/sglang/srt/managers/scheduler_output_processor_mixin.py
+++ b/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -14,7 +14,13 @@ from sglang.srt.managers.io_struct import (
     BatchEmbeddingOutput,
     BatchTokenIDOutput,
 )
-from sglang.srt.managers.schedule_batch import
+from sglang.srt.managers.schedule_batch import (
+    BaseFinishReason,
+    Req,
+    RequestStage,
+    ScheduleBatch,
+)
+from sglang.srt.tracing.trace import trace_slice
 from sglang.srt.utils.common import ceil_div

 if TYPE_CHECKING:
@@ -160,6 +166,14 @@ class SchedulerOutputProcessorMixin:
                         )
                         self.abort_request(AbortReq(rid=req.rid))
                     req.grammar.finished = req.finished()
+
+                trace_slice(
+                    RequestStage.PREFILL_FORWARD,
+                    req.rid,
+                    auto_next_anon=not req.finished(),
+                    thread_finish_flag=req.finished(),
+                )
+
             else:
                 # being chunked reqs' prefill is not finished
                 req.is_chunked -= 1
@@ -188,6 +202,12 @@ class SchedulerOutputProcessorMixin:
                     )
                     logprob_pt += num_input_logprobs

+                trace_slice(
+                    RequestStage.PREFILL_CHUNKED_FORWARD,
+                    req.rid,
+                    auto_next_anon=True,
+                )
+
         else:  # embedding or reward model
             is_sparse = envs.SGLANG_EMBEDDINGS_SPARSE_HEAD.is_set()

@@ -203,7 +223,10 @@ class SchedulerOutputProcessorMixin:
                         i
                     ].item()
                 else:
-                    embeddings
+                    if isinstance(embeddings, torch.Tensor):
+                        embeddings = embeddings.tolist()
+                    else:
+                        embeddings = [tensor.tolist() for tensor in embeddings]

             # Check finish conditions
             for i, req in enumerate(batch.reqs):
@@ -224,6 +247,13 @@ class SchedulerOutputProcessorMixin:
                     # being chunked reqs' prefill is not finished
                     req.is_chunked -= 1

+            trace_slice(
+                RequestStage.PREFILL_FORWARD,
+                req.rid,
+                auto_next_anon=not req.finished(),
+                thread_finish_flag=req.finished(),
+            )
+
         self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)

     def _resolve_spec_overlap_token_ids(
@@ -727,6 +757,7 @@ class SchedulerOutputProcessorMixin:
             cached_tokens = []
             spec_verify_ct = []
             spec_accepted_tokens = []
+            retraction_counts = []
             output_hidden_states = None

             if return_logprob:
@@ -758,7 +789,7 @@ class SchedulerOutputProcessorMixin:
                     continue

                 # Multimodal partial stream chunks break the detokenizer, so drop aborted requests here.
-                if self.model_config.is_multimodal_gen and req.
+                if self.model_config.is_multimodal_gen and req.to_finish:
                     continue

                 if req.finished():
@@ -828,6 +859,8 @@ class SchedulerOutputProcessorMixin:
                 completion_tokens.append(len(output_ids_))
                 cached_tokens.append(req.cached_tokens)

+                retraction_counts.append(req.retraction_count)
+
                 if not self.spec_algorithm.is_none():
                     spec_verify_ct.append(req.spec_verify_ct)
                     spec_accepted_tokens.append(req.spec_accepted_tokens)
@@ -950,6 +983,7 @@ class SchedulerOutputProcessorMixin:
                     http_worker_ipcs=http_worker_ipcs,
                     placeholder_tokens_idx=None,
                     placeholder_tokens_val=None,
+                    retraction_counts=retraction_counts,
                 )
             )

@@ -961,6 +995,7 @@ class SchedulerOutputProcessorMixin:
         embeddings = []
         prompt_tokens = []
         cached_tokens = []
+        retraction_counts = []
         for req in reqs:
             if req.finished():
                 rids.append(req.rid)
@@ -969,6 +1004,7 @@ class SchedulerOutputProcessorMixin:
                 embeddings.append(req.embedding)
                 prompt_tokens.append(len(req.origin_input_ids))
                 cached_tokens.append(req.cached_tokens)
+                retraction_counts.append(req.retraction_count)
         self.send_to_detokenizer.send_output(
             BatchEmbeddingOutput(
                 finished_reasons,
@@ -979,5 +1015,6 @@ class SchedulerOutputProcessorMixin:
                 http_worker_ipcs=http_worker_ipcs,
                 placeholder_tokens_idx=None,
                 placeholder_tokens_val=None,
+                retraction_counts=retraction_counts,
             )
         )
```
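The embedding hunk above replaces a truncated assignment with logic that normalizes batch output to plain Python lists, since the runner may return either one stacked tensor or a list of per-request tensors. A standalone sketch of that normalization (assuming only that PyTorch is installed):

```python
import torch


def normalize_embeddings(embeddings):
    """Convert a stacked tensor or a list of per-request tensors to plain lists."""
    if isinstance(embeddings, torch.Tensor):
        # One (batch, dim) tensor -> list of per-request rows.
        return embeddings.tolist()
    # Heterogeneous outputs arrive as a list of tensors of possibly different lengths.
    return [tensor.tolist() for tensor in embeddings]


print(normalize_embeddings(torch.ones(2, 3)))                 # [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
print(normalize_embeddings([torch.zeros(2), torch.ones(3)]))  # [[0.0, 0.0], [1.0, 1.0, 1.0]]
```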
```diff
--- a/sglang/srt/managers/scheduler_pp_mixin.py
+++ b/sglang/srt/managers/scheduler_pp_mixin.py
@@ -4,7 +4,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import ScheduleBatch
 from sglang.srt.managers.utils import GenerationBatchResult
 from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
-from sglang.srt.utils import DynamicGradMode, point_to_point_pyobj
+from sglang.srt.utils import DynamicGradMode, point_to_point_pyobj, require_mlp_sync


 class SchedulerPPMixin:
@@ -236,7 +236,12 @@ class SchedulerPPMixin:
                 tmbs[mb_id] = transferred_rids

             self.process_prefill_chunk()
-
+
+            batch = self.get_new_batch_prefill()
+            if require_mlp_sync(self.server_args):
+                batch = self.prepare_mlp_sync_batch(batch)
+            mbs[mb_id] = batch
+
             self.running_mbs[mb_id] = self.running_batch

             self.cur_batch = mbs[mb_id]
```
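The pipeline-parallel hunk now builds the prefill micro-batch inline and, when the server configuration requires it, passes it through an MLP-sync preparation step before storing it in the micro-batch slot. The sketch below mirrors only that control-flow shape; `needs_sync`, `sync_prepare`, and the `Batch` type are hypothetical stand-ins, not sglang APIs, and the DP-attention rationale in the comments is an assumption:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Batch:  # hypothetical stand-in for ScheduleBatch
    reqs: list


def needs_sync(server_args: dict) -> bool:
    # Stand-in for require_mlp_sync: assumed here to trigger when, e.g.,
    # data-parallel attention requires all ranks to step together.
    return server_args.get("enable_dp_attention", False)


def sync_prepare(batch: Optional[Batch]) -> Optional[Batch]:
    # Stand-in for prepare_mlp_sync_batch: may replace None with an idle batch
    # so every rank executes the same number of forward passes.
    return batch if batch is not None else Batch(reqs=[])


def schedule_microbatch(mbs: list, mb_id: int, new_batch, server_args: dict) -> None:
    batch = new_batch
    if needs_sync(server_args):
        batch = sync_prepare(batch)
    mbs[mb_id] = batch


mbs = [None] * 4
schedule_microbatch(mbs, 0, None, {"enable_dp_attention": True})
print(mbs[0])  # Batch(reqs=[]) -- an idle batch keeps ranks in lockstep
```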
```diff
--- a/sglang/srt/managers/scheduler_profiler_mixin.py
+++ b/sglang/srt/managers/scheduler_profiler_mixin.py
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
 class SchedulerProfilerMixin:
     def init_profiler(self):
         self.torch_profiler = None
-        self.torch_profiler_output_dir: Optional[
+        self.torch_profiler_output_dir: Optional[Path] = None
         self.profiler_activities: Optional[List[str]] = None
         self.profile_id: Optional[str] = None
         self.profiler_start_forward_ct: Optional[int] = None
@@ -69,7 +69,7 @@ class SchedulerProfilerMixin:
         if activities is None:
             activities = ["CPU", "GPU"]

-        self.torch_profiler_output_dir = output_dir
+        self.torch_profiler_output_dir = Path(output_dir).expanduser()
         self.torch_profiler_with_stack = with_stack
         self.torch_profiler_record_shapes = record_shapes
         self.profiler_activities = activities
@@ -213,8 +213,7 @@ class SchedulerProfilerMixin:
                 message="Profiling is not in progress. Call /start_profile first.",
             )

-
-        Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
+        self.torch_profiler_output_dir.mkdir(parents=True, exist_ok=True)

         stage_suffix = f"-{stage.name}" if stage else ""
         logger.info("Stop profiling" + stage_suffix + "...")
```
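The profiler hunks normalize the output directory once, at start time, into a `pathlib.Path` with `~` expanded, so the stop path can call `mkdir` directly instead of re-wrapping a raw string. A minimal sketch of the same pattern (the `Profiler` class here is illustrative, not sglang's):

```python
from pathlib import Path
from typing import Optional


class Profiler:
    def __init__(self) -> None:
        self.output_dir: Optional[Path] = None

    def start(self, output_dir: str) -> None:
        # Normalize once: expand "~" and keep a Path from here on.
        self.output_dir = Path(output_dir).expanduser()

    def stop(self) -> None:
        assert self.output_dir is not None, "start() must be called first"
        # Safe to call directly; no re-wrapping of a raw string needed.
        self.output_dir.mkdir(parents=True, exist_ok=True)


p = Profiler()
p.start("~/profiler-traces")
p.stop()  # creates ~/profiler-traces if it does not exist
```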
```diff
--- a/sglang/srt/managers/scheduler_runtime_checker_mixin.py
+++ b/sglang/srt/managers/scheduler_runtime_checker_mixin.py
@@ -1,5 +1,8 @@
 from __future__ import annotations

+import logging
+import signal
+import sys
 import time
 from typing import TYPE_CHECKING

@@ -7,10 +10,13 @@ from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.managers.schedule_batch import ScheduleBatch
 from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache
 from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
+from sglang.srt.utils.common import disable_request_logging, pyspy_dump_schedulers

 if TYPE_CHECKING:
     from sglang.srt.managers.scheduler import Scheduler

+logger = logging.getLogger(__name__)
+

 class SchedulerRuntimeCheckerMixin:

@@ -215,3 +221,42 @@ class SchedulerRuntimeCheckerMixin:
         self.check_tree_cache()
         self.new_token_ratio = self.init_new_token_ratio
         self.maybe_sleep_on_idle()
+
+    def watchdog_thread(self: Scheduler):
+        """A watch dog thread that will try to kill the server itself if one forward batch takes too long."""
+        self.watchdog_last_forward_ct = 0
+        self.watchdog_last_time = time.perf_counter()
+
+        while True:
+            current = time.perf_counter()
+            if self.cur_batch is not None:
+                if self.watchdog_last_forward_ct == self.forward_ct:
+                    if current > self.watchdog_last_time + self.watchdog_timeout:
+                        break
+                else:
+                    self.watchdog_last_forward_ct = self.forward_ct
+                    self.watchdog_last_time = current
+            time.sleep(self.watchdog_timeout // 2)
+
+        if not disable_request_logging():
+            # Print batch size and memory pool info to check whether there are de-sync issues.
+            if self.is_hybrid:
+                _, info_msg = self._check_hybrid_memory()
+            elif self.is_hybrid_gdn and isinstance(self.tree_cache, MambaRadixCache):
+                _, info_msg = self._check_mamba_memory()
+            else:
+                _, info_msg = self._check_radix_cache_memory()
+            logger.error(
+                f"{self.cur_batch.batch_size()=}\n"
+                f"{self.cur_batch.reqs=}\n"
+                f"{info_msg}"
+            )
+
+        pyspy_dump_schedulers()
+        logger.error(f"Watchdog timeout ({self.watchdog_timeout=})")
+        print(file=sys.stderr, flush=True)
+        print(file=sys.stdout, flush=True)
+
+        # Wait for some time so that the parent process can print the error.
+        time.sleep(5)
+        self.parent_process.send_signal(signal.SIGQUIT)
```
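The new `watchdog_thread` follows a common liveness-check pattern: periodically sample a forward counter, and if it has not advanced within the timeout while a batch is in flight, dump diagnostics and signal the parent. Below is a simplified, self-contained sketch of that loop; the `Watchdog` class and its fields are stand-ins for the scheduler state, not sglang's implementation:

```python
import threading
import time


class Watchdog:
    """Break out of the monitor loop if `forward_ct` stalls while busy."""

    def __init__(self, timeout: float):
        self.timeout = timeout
        self.forward_ct = 0   # incremented by the worker on progress
        self.busy = False     # True while a batch is in flight
        self._last_ct = 0
        self._last_time = time.perf_counter()

    def run(self, on_timeout):
        while True:
            now = time.perf_counter()
            if self.busy:
                if self._last_ct == self.forward_ct:
                    if now > self._last_time + self.timeout:
                        on_timeout()  # e.g. log state, then signal the parent
                        return
                else:  # progress observed: reset the deadline
                    self._last_ct = self.forward_ct
                    self._last_time = now
            time.sleep(self.timeout / 2)


wd = Watchdog(timeout=0.2)
wd.busy = True  # simulate a stuck forward pass
threading.Thread(target=wd.run, args=(lambda: print("watchdog timeout"),)).start()
```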
```diff
--- a/sglang/srt/managers/scheduler_update_weights_mixin.py
+++ b/sglang/srt/managers/scheduler_update_weights_mixin.py
@@ -5,7 +5,12 @@ from typing import TYPE_CHECKING, Tuple

 import torch

-from sglang.srt.constants import
+from sglang.srt.constants import (
+    GPU_MEMORY_ALL_TYPES,
+    GPU_MEMORY_TYPE_CUDA_GRAPH,
+    GPU_MEMORY_TYPE_KV_CACHE,
+    GPU_MEMORY_TYPE_WEIGHTS,
+)
 from sglang.srt.managers.io_struct import (
     DestroyWeightsUpdateGroupReqInput,
     DestroyWeightsUpdateGroupReqOutput,
@@ -101,10 +106,14 @@ class SchedulerUpdateWeightsMixin:
     def release_memory_occupation(
         self: Scheduler, recv_req: ReleaseMemoryOccupationReqInput
     ):
+        assert (
+            self._is_no_request()
+        ), "release_memory_occupation should be called only when no ongoing request."
+
         tags = recv_req.tags

         if tags is None or len(tags) == 0:
-            tags =
+            tags = GPU_MEMORY_ALL_TYPES

         for tag in tags:
             self.offload_tags.add(tag)
@@ -120,6 +129,9 @@ class SchedulerUpdateWeightsMixin:
             torch.distributed.barrier(self.tp_cpu_group)
             self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)

+        if GPU_MEMORY_TYPE_CUDA_GRAPH in tags:
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_CUDA_GRAPH)
+
         return ReleaseMemoryOccupationReqOutput()

     def resume_memory_occupation(
@@ -128,11 +140,14 @@ class SchedulerUpdateWeightsMixin:
         tags = recv_req.tags

         if tags is None or len(tags) == 0:
-            tags =
+            tags = GPU_MEMORY_ALL_TYPES

         for tag in tags:
             self.offload_tags.remove(tag)

+        if GPU_MEMORY_TYPE_CUDA_GRAPH in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_CUDA_GRAPH)
+
         if GPU_MEMORY_TYPE_WEIGHTS in tags:
             self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_WEIGHTS)
             torch.distributed.barrier(self.tp_cpu_group)
```
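The update-weights hunks extend the tag-driven pause/resume protocol with a CUDA-graph memory type and guard release behind an idle check. The toy sketch below mirrors the protocol shape only; the tag constant names come from the hunk, but their string values, the class, and the `_is_no_request` stub are hypothetical:

```python
GPU_MEMORY_TYPE_WEIGHTS = "weights"
GPU_MEMORY_TYPE_KV_CACHE = "kv_cache"
GPU_MEMORY_TYPE_CUDA_GRAPH = "cuda_graph"
GPU_MEMORY_ALL_TYPES = [
    GPU_MEMORY_TYPE_WEIGHTS,
    GPU_MEMORY_TYPE_KV_CACHE,
    GPU_MEMORY_TYPE_CUDA_GRAPH,
]


class ToyMemoryManager:
    def __init__(self):
        self.offload_tags = set()

    def _is_no_request(self) -> bool:
        return True  # stand-in: the real check inspects pending request queues

    def release(self, tags=None):
        assert self._is_no_request(), "release only when no ongoing request"
        tags = tags or GPU_MEMORY_ALL_TYPES  # None/empty means "everything"
        for tag in tags:
            self.offload_tags.add(tag)
            print(f"pause {tag}")

    def resume(self, tags=None):
        tags = tags or GPU_MEMORY_ALL_TYPES
        for tag in tags:
            self.offload_tags.discard(tag)
            print(f"resume {tag}")


m = ToyMemoryManager()
m.release()                            # pauses weights, kv_cache, cuda_graph
m.resume([GPU_MEMORY_TYPE_WEIGHTS])    # resumes only the weights pool
```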
```diff
--- a/sglang/srt/managers/session_controller.py
+++ b/sglang/srt/managers/session_controller.py
@@ -15,11 +15,11 @@ import uuid
 from typing import Dict, Optional

 from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
-from sglang.srt.managers.schedule_batch import Req
+from sglang.srt.managers.schedule_batch import FINISH_ABORT, Req


 class SessionReqNode:
-    def __init__(self, req, parent=None, childs=None):
+    def __init__(self, req: Req, parent=None, childs=None):
         self.req = req
         self.parent = parent
         if parent is not None:
@@ -36,12 +36,12 @@ class SessionReqNode:
             req_node.clear(req_dict)

         if self.req.finished_reason is None:
-            self.req.
+            self.req.to_finish = FINISH_ABORT()
         del req_dict[self.req.rid]

     def abort(self):
         if self.req.finished_reason is None:
-            self.req.
+            self.req.to_finish = FINISH_ABORT()

     def __str__(self):
         return self._str_helper(self.req.rid)
@@ -137,13 +137,14 @@ class Session:
             origin_input_ids=input_ids,
             origin_input_ids_unpadded=input_ids_unpadded,
             sampling_params=req.sampling_params,
-
+            lora_id=req.lora_id,
             session_id=self.session_id,
             custom_logit_processor=req.custom_logit_processor,
             stream=req.stream,
             return_logprob=req.return_logprob,
             top_logprobs_num=req.top_logprobs_num,
             token_ids_logprob=req.token_ids_logprob,
+            vocab_size=tokenizer.vocab_size,
         )
         if last_req is not None:
             new_req.multimodal_inputs = last_req.multimodal_inputs
```