sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +18 -3
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  5. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
  6. sglang/srt/checkpoint_engine/__init__.py +9 -0
  7. sglang/srt/checkpoint_engine/update.py +317 -0
  8. sglang/srt/configs/__init__.py +2 -0
  9. sglang/srt/configs/deepseek_ocr.py +542 -10
  10. sglang/srt/configs/deepseekvl2.py +95 -194
  11. sglang/srt/configs/kimi_linear.py +160 -0
  12. sglang/srt/configs/mamba_utils.py +66 -0
  13. sglang/srt/configs/model_config.py +25 -2
  14. sglang/srt/constants.py +7 -0
  15. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  16. sglang/srt/disaggregation/decode.py +34 -6
  17. sglang/srt/disaggregation/nixl/conn.py +2 -2
  18. sglang/srt/disaggregation/prefill.py +25 -3
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  20. sglang/srt/distributed/parallel_state.py +9 -5
  21. sglang/srt/entrypoints/engine.py +13 -5
  22. sglang/srt/entrypoints/http_server.py +22 -3
  23. sglang/srt/entrypoints/openai/protocol.py +7 -1
  24. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  27. sglang/srt/environ.py +7 -0
  28. sglang/srt/eplb/expert_distribution.py +34 -1
  29. sglang/srt/eplb/expert_location.py +106 -36
  30. sglang/srt/grpc/compile_proto.py +3 -0
  31. sglang/srt/layers/attention/ascend_backend.py +233 -5
  32. sglang/srt/layers/attention/attention_registry.py +3 -0
  33. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  34. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  35. sglang/srt/layers/attention/fla/kda.py +1359 -0
  36. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  37. sglang/srt/layers/attention/flashattention_backend.py +7 -6
  38. sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
  39. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  40. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  41. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  42. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  43. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  44. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  45. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  46. sglang/srt/layers/attention/nsa_backend.py +157 -23
  47. sglang/srt/layers/attention/triton_backend.py +4 -1
  48. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  49. sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
  50. sglang/srt/layers/communicator.py +23 -1
  51. sglang/srt/layers/layernorm.py +16 -2
  52. sglang/srt/layers/logits_processor.py +4 -20
  53. sglang/srt/layers/moe/ep_moe/layer.py +0 -18
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  57. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  59. sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
  60. sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
  61. sglang/srt/layers/moe/topk.py +31 -6
  62. sglang/srt/layers/pooler.py +21 -2
  63. sglang/srt/layers/quantization/__init__.py +9 -78
  64. sglang/srt/layers/quantization/auto_round.py +394 -0
  65. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  66. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  67. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  68. sglang/srt/layers/rotary_embedding.py +117 -45
  69. sglang/srt/lora/lora_registry.py +9 -0
  70. sglang/srt/managers/async_mm_data_processor.py +122 -0
  71. sglang/srt/managers/data_parallel_controller.py +30 -3
  72. sglang/srt/managers/detokenizer_manager.py +3 -0
  73. sglang/srt/managers/io_struct.py +26 -4
  74. sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
  75. sglang/srt/managers/schedule_batch.py +74 -15
  76. sglang/srt/managers/scheduler.py +164 -129
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  78. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  79. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  80. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  81. sglang/srt/managers/session_controller.py +6 -5
  82. sglang/srt/managers/tokenizer_manager.py +154 -59
  83. sglang/srt/managers/tp_worker.py +24 -1
  84. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  85. sglang/srt/mem_cache/common.py +1 -0
  86. sglang/srt/mem_cache/memory_pool.py +171 -57
  87. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  88. sglang/srt/mem_cache/radix_cache.py +4 -0
  89. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  90. sglang/srt/metrics/collector.py +46 -3
  91. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  92. sglang/srt/model_executor/forward_batch_info.py +11 -11
  93. sglang/srt/model_executor/model_runner.py +76 -21
  94. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  95. sglang/srt/model_loader/weight_utils.py +1 -1
  96. sglang/srt/models/bailing_moe.py +9 -2
  97. sglang/srt/models/deepseek_nextn.py +11 -2
  98. sglang/srt/models/deepseek_v2.py +149 -34
  99. sglang/srt/models/glm4.py +391 -77
  100. sglang/srt/models/glm4v.py +196 -55
  101. sglang/srt/models/glm4v_moe.py +0 -1
  102. sglang/srt/models/gpt_oss.py +1 -10
  103. sglang/srt/models/kimi_linear.py +678 -0
  104. sglang/srt/models/llama4.py +1 -1
  105. sglang/srt/models/llama_eagle3.py +11 -1
  106. sglang/srt/models/longcat_flash.py +2 -2
  107. sglang/srt/models/minimax_m2.py +1 -1
  108. sglang/srt/models/qwen2.py +1 -1
  109. sglang/srt/models/qwen2_moe.py +30 -15
  110. sglang/srt/models/qwen3.py +1 -1
  111. sglang/srt/models/qwen3_moe.py +16 -8
  112. sglang/srt/models/qwen3_next.py +7 -0
  113. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  114. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  115. sglang/srt/multiplex/pdmux_context.py +164 -0
  116. sglang/srt/parser/conversation.py +7 -1
  117. sglang/srt/sampling/custom_logit_processor.py +67 -1
  118. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  119. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  120. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  121. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  122. sglang/srt/server_args.py +103 -22
  123. sglang/srt/single_batch_overlap.py +4 -1
  124. sglang/srt/speculative/draft_utils.py +16 -0
  125. sglang/srt/speculative/eagle_info.py +42 -36
  126. sglang/srt/speculative/eagle_info_v2.py +68 -25
  127. sglang/srt/speculative/eagle_utils.py +261 -16
  128. sglang/srt/speculative/eagle_worker.py +11 -3
  129. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  130. sglang/srt/speculative/spec_info.py +305 -31
  131. sglang/srt/speculative/spec_utils.py +44 -8
  132. sglang/srt/tracing/trace.py +121 -12
  133. sglang/srt/utils/common.py +55 -32
  134. sglang/srt/utils/hf_transformers_utils.py +38 -16
  135. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  136. sglang/test/kits/radix_cache_server_kit.py +50 -0
  137. sglang/test/runners.py +31 -7
  138. sglang/test/simple_eval_common.py +5 -3
  139. sglang/test/simple_eval_humaneval.py +1 -0
  140. sglang/test/simple_eval_math.py +1 -0
  141. sglang/test/simple_eval_mmlu.py +1 -0
  142. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  143. sglang/test/test_utils.py +7 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
  146. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
  147. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  148. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  149. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/configs/model_config.py CHANGED
@@ -205,6 +205,14 @@ class ModelConfig:
             self.hf_config, "image_token_id", None
         ) or getattr(self.hf_config, "image_token_index", None)
 
+        # matryoshka embeddings
+        self.matryoshka_dimensions = getattr(
+            self.hf_config, "matryoshka_dimensions", None
+        )
+        self.is_matryoshka = self.matryoshka_dimensions or getattr(
+            self.hf_config, "is_matryoshka", False
+        )
+
     @staticmethod
     def from_server_args(
         server_args: ServerArgs,
@@ -358,6 +366,13 @@ class ModelConfig:
             self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
             self.v_head_dim = self.hf_text_config.v_head_dim
             self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
+        elif "KimiLinearForCausalLM" in self.hf_config.architectures:
+            self.head_dim = 72
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_config.v_head_dim
+            self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
         else:
             if (
                 "MistralModel" in self.hf_config.architectures
@@ -582,14 +597,20 @@ class ModelConfig:
             return
 
         # Check if ModelOpt quantization is specified
-        modelopt_quantization_specified = self.quantization in [
+        _MODELOPT_QUANTIZATION_METHODS = [
             "modelopt",
             "modelopt_fp8",
             "modelopt_fp4",
         ]
+        modelopt_quantization_specified = (
+            self.quantization in _MODELOPT_QUANTIZATION_METHODS
+        )
 
         if not modelopt_quantization_specified:
-            raise ValueError("quantize_and_serve requires ModelOpt quantization")
+            raise ValueError(
+                "quantize_and_serve requires ModelOpt quantization (set with --quantization "
+                f"{{{', '.join(sorted(_MODELOPT_QUANTIZATION_METHODS))}}})"
+            )
 
         # quantize_and_serve is disabled due to compatibility issues
         raise NotImplementedError(
@@ -613,6 +634,7 @@ class ModelConfig:
             "petit_nvfp4",
             "quark",
             "mxfp4",
+            "auto-round",
         ]
         optimized_quantization_methods = [
             "fp8",
@@ -634,6 +656,7 @@ class ModelConfig:
             "petit_nvfp4",
         ]
         compatible_quantization_methods = {
+            "modelopt_fp8": ["modelopt"],
             "modelopt_fp4": ["modelopt"],
             "petit_nvfp4": ["modelopt"],
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
sglang/srt/constants.py CHANGED
@@ -1,3 +1,10 @@
 # GPU Memory Types
 GPU_MEMORY_TYPE_KV_CACHE = "kv_cache"
 GPU_MEMORY_TYPE_WEIGHTS = "weights"
+GPU_MEMORY_TYPE_CUDA_GRAPH = "cuda_graph"
+
+GPU_MEMORY_ALL_TYPES = [
+    GPU_MEMORY_TYPE_KV_CACHE,
+    GPU_MEMORY_TYPE_WEIGHTS,
+    GPU_MEMORY_TYPE_CUDA_GRAPH,
+]
sglang/srt/debug_utils/tensor_dump_forward_hook.py ADDED
@@ -0,0 +1,149 @@
+"""
+This file provides a function `register_forward_hook_for_model` that registers a forward hook on every operator of the model.
+After registration, during model inference, all tensors generated throughout the forward pass will be recorded.
+
+Usage:
+Specify the output directory for dumping tensors using the argument `--debug-tensor-dump-output-folder`.
+A separate directory will be created for each GPU rank, named in the format `f"TP{tp_rank}_PP{pp_rank}_Rank{rank}_pid{pid}"`.
+Each complete forward pass of the model generates a `.pt` file named `f"Pass{pass_num}.pt"`, which can be loaded using `torch.load`.
+The file contains a series of key-value pairs, where the keys correspond to operator names in the model
+(similar to those in model.safetensors.index.json), and the values are the outputs produced by the respective operators.
+"""
+
+import logging
+import os
+from pathlib import Path
+
+import torch
+
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+
+logger = logging.getLogger(__name__)
+
+
+class TensorDumper:
+    def __init__(
+        self, dump_dir: str, dump_layers: int, tp_size: int, tp_rank: int, pp_rank: int
+    ):
+        self._dump_layers = dump_layers
+        self._forward_pass_id = 0
+        self._pid = os.getpid()
+        self._current_tensors = {}
+        self._base_dir = Path(dump_dir)
+        rank = tp_size * pp_rank + tp_rank
+        self._process_dir = (
+            self._base_dir / f"TP{tp_rank}_PP{pp_rank}_Rank{rank}_pid{self._pid}"
+        )
+        self._process_dir.mkdir(parents=True, exist_ok=True)
+
+    def get_dump_dir(self):
+        return str(self._process_dir)
+
+    def add_tensor(self, name, tensor_item):
+        if isinstance(tensor_item, (tuple, list)):
+            tensors = [t.cpu() for t in tensor_item if t is not None]
+            if len(tensors) == 1:
+                self._current_tensors[name] = tensors[0]
+            else:
+                self._current_tensors[name] = tensors
+        elif isinstance(tensor_item, torch.Tensor):
+            self._current_tensors[name] = tensor_item.cpu()
+        elif isinstance(tensor_item, LogitsProcessorOutput):
+            self._current_tensors[name] = tensor_item.next_token_logits.cpu()
+        elif isinstance(tensor_item, ForwardBatch):
+            self._current_tensors[name + ".forward_batch_info.input_ids"] = (
+                tensor_item.input_ids.cpu()
+            )
+            self._current_tensors[name + ".forward_batch_info.seq_lens"] = (
+                tensor_item.seq_lens.cpu()
+            )
+            self._current_tensors[name + ".forward_batch_info.positions"] = (
+                tensor_item.positions.cpu()
+            )
+        elif isinstance(tensor_item, PPProxyTensors):
+            for tensor_name in tensor_item.tensors.keys():
+                self._current_tensors[name + ".pp_proxy_tensors." + tensor_name] = (
+                    tensor_item.tensors[tensor_name].cpu()
+                )
+        else:
+            logger.warning(f"Unsupported type: {type(tensor_item)}: {tensor_item}")
+
+    def dump_current_tensors(self):
+        if len(self._current_tensors) == 0:
+            return
+        tensor_file_for_pass = self._process_dir / f"Pass{self._forward_pass_id:05d}.pt"
+        logger.info(
+            f"Dump {self._forward_pass_id:05d}th pass to {tensor_file_for_pass}"
+        )
+        torch.save(self._current_tensors, str(tensor_file_for_pass))
+        self._current_tensors = {}
+        self._forward_pass_id += 1
+
+    def _add_hook_recursive(
+        self, model, prefix, top_level_module_name, layers_module_name
+    ):
+        model_top_level_module_matched = False
+        layers_prefix = top_level_module_name + "." + layers_module_name
+        for name, module in model._modules.items():
+            top_level_model = False
+            if len(prefix) == 0:
+                cur_name = name
+                if cur_name == top_level_module_name:
+                    model_top_level_module_matched = True
+                    top_level_model = True
+            else:
+                cur_name = prefix + "." + name
+            if self._dump_layers > 0 and name.isdigit() and prefix == layers_prefix:
+                # If we only need n layers, skip the reset layers.
+                # Most models' layout is like model.layers.0.
+                cur_layer = int(name)
+                if cur_layer >= self._dump_layers:
+                    continue
+            if module is not None:
+                _, sub_count = self._add_hook_recursive(
+                    module, cur_name, top_level_module_name, layers_module_name
+                )
+                if sub_count == 0 or top_level_model:
+                    # Avoid duplicated output hooks, e.g. self_attn may contain:
+                    # self_attn.qkv_proj, self_attn.attn & self_attn.o_proj.
+                    # Therefore, we do not need to add output hooks for self_attn,
+                    # since the output of self_attn should be the same to self_attn.o_proj.
+                    module.register_forward_hook(
+                        self._dump_hook(cur_name, top_level_model)
+                    )
+        return model_top_level_module_matched, len(model._modules.items())
+
+    def _dump_hook(self, tensor_name, do_dump):
+        def inner_dump_hook(module, input, output):
+            if do_dump:
+                # This is the top-level model, so we will record the input for it.
+                for item in input:
+                    if isinstance(item, ForwardBatch):
+                        self.add_tensor(tensor_name, item)
+                self.dump_current_tensors()
+            if output is not None:
+                self.add_tensor(tensor_name, output)
+
+        return inner_dump_hook
+
+
+def register_forward_hook_for_model(
+    model, dump_dir: str, dump_layers: int, tp_size: int, tp_rank: int, pp_rank: int
+):
+    tensor_dumper = TensorDumper(dump_dir, dump_layers, tp_size, tp_rank, pp_rank)
+    # Most models have the layerout like:
+    # XxxxForCausalLM
+    #   (model): XxxxModel
+    #     (layers): ModuleList
+    # If the model is not constructed with this layout,
+    # environment variable can be used to specify the module names.
+    top_level_module_name = os.getenv("TENSOR_DUMP_TOP_LEVEL_MODULE_NAME", "model")
+    layers_module_name = os.getenv("TENSOR_DUMP_LAYERS_MODULE_NAME", "layers")
+    model_top_level_module_matched, _ = tensor_dumper._add_hook_recursive(
+        model, "", top_level_module_name, layers_module_name
+    )
+    assert (
+        model_top_level_module_matched
+    ), f"model should have a module named {top_level_module_name}"
+    return tensor_dumper
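
Per the module docstring above, each forward pass is saved as a `Pass{pass_num}.pt` file keyed by operator name. A minimal sketch for inspecting one dump offline; the path below merely follows the documented `TP{tp_rank}_PP{pp_rank}_Rank{rank}_pid{pid}` naming and is a placeholder:

import torch

# Placeholder path following the naming scheme described in the docstring above.
dump_file = "tensor_dumps/TP0_PP0_Rank0_pid12345/Pass00000.pt"
dumped = torch.load(dump_file, map_location="cpu")
for op_name, value in dumped.items():
    # Values are tensors, or lists of tensors when an operator returned several outputs.
    shape = value.shape if hasattr(value, "shape") else [t.shape for t in value]
    print(op_name, shape)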
sglang/srt/disaggregation/decode.py CHANGED
@@ -58,6 +58,11 @@ from sglang.srt.mem_cache.memory_pool import (
     ReqToTokenPool,
     SWAKVPool,
 )
+from sglang.srt.tracing.trace import (
+    trace_event_batch,
+    trace_slice_batch,
+    trace_slice_end,
+)
 from sglang.srt.utils import get_int_env_var, require_mlp_sync
 from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
 
@@ -313,6 +318,7 @@ class DecodePreallocQueue:
         )
 
         req.add_latency(RequestStage.DECODE_PREPARE)
+        trace_slice_end(RequestStage.DECODE_PREPARE, req.rid, auto_next_anon=True)
         self.queue.append(
             DecodeRequest(req=req, kv_receiver=kv_receiver, waiting_for_input=False)
         )
@@ -521,13 +527,15 @@ class DecodePreallocQueue:
             decode_req.kv_receiver.init(
                 page_indices, decode_req.metadata_buffer_index, state_indices
             )
-            decode_req.req.add_latency(RequestStage.DECODE_BOOTSTRAP)
             preallocated_reqs.append(decode_req)
             indices_to_remove.add(i)
             decode_req.req.time_stats.decode_transfer_queue_entry_time = (
                 time.perf_counter()
             )
             decode_req.req.add_latency(RequestStage.DECODE_BOOTSTRAP)
+            trace_slice_end(
+                RequestStage.DECODE_BOOTSTRAP, decode_req.req.rid, auto_next_anon=True
+            )
 
         self.queue = [
             entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
@@ -765,8 +773,12 @@
                 indices_to_remove.add(i)
                 decode_req.req.time_stats.wait_queue_entry_time = time.perf_counter()
 
-                # special handling for sampling_params.max_new_tokens == 1
-                if decode_req.req.sampling_params.max_new_tokens == 1:
+                # special handling for corner cases
+                should_finish = (
+                    decode_req.req.sampling_params.max_new_tokens == 1
+                    or output_id in decode_req.req.eos_token_ids
+                )
+                if should_finish:
                     # finish immediately
                     decode_req.req.time_stats.forward_entry_time = (
                         decode_req.req.time_stats.completion_time
@@ -776,8 +788,19 @@
                         [decode_req.req], decode_req.req.return_logprob
                     )
                     self.tree_cache.cache_finished_req(decode_req.req)
+                    trace_slice_end(
+                        RequestStage.DECODE_QUICK_FINISH,
+                        decode_req.req.rid,
+                        thread_finish_flag=True,
+                    )
                 else:
                     transferred_reqs.append(decode_req.req)
+                    trace_slice_end(
+                        RequestStage.DECODE_TRANSFERRED,
+                        decode_req.req.rid,
+                        auto_next_anon=True,
+                    )
+
             elif poll in [
                 KVPoll.Bootstrapping,
                 KVPoll.WaitingForInput,
@@ -823,6 +846,7 @@ class SchedulerDisaggregationDecodeMixin:
                 self.stream_output(
                     batch.reqs, any(req.return_logprob for req in batch.reqs)
                 )
+                trace_slice_batch(RequestStage.DECODE_FAKE_OUTPUT, batch.reqs)
                 if prepare_mlp_sync_flag:
                     self._prepare_idle_batch_and_run(None)
             else:
@@ -872,6 +896,7 @@
                 self.stream_output(
                     batch.reqs, any(req.return_logprob for req in batch.reqs)
                 )
+                trace_slice_batch(RequestStage.DECODE_FAKE_OUTPUT, batch.reqs)
                 if prepare_mlp_sync_flag:
                     batch_, batch_result = self._prepare_idle_batch_and_run(
                         None, delay_process=True
@@ -954,6 +979,9 @@
         self.running_batch = self.update_running_batch(self.running_batch)
         ret = self.running_batch if not self.running_batch.is_empty() else None
 
+        if ret:
+            attrs = {"bid": hex(id(ret)), "batch_size": ret.batch_size()}
+            trace_event_batch("schedule", ret.reqs, attrs=attrs)
         return ret
 
     def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
@@ -1009,6 +1037,9 @@
         return new_batch
 
     def process_decode_queue(self: Scheduler):
+        if self.server_args.disaggregation_decode_enable_offload_kvcache:
+            self.decode_offload_manager.check_offload_progress()
+
         # try to resume retracted requests if there are enough space for another `num_reserved_decode_tokens` decode steps
         resumed_reqs = self.disagg_decode_prealloc_queue.resume_retracted_reqs()
         self.waiting_queue.extend(resumed_reqs)
@@ -1031,6 +1062,3 @@
             self.disagg_decode_transfer_queue.pop_transferred()
         )  # the requests which kv has arrived
         self.waiting_queue.extend(alloc_reqs)
-
-        if self.server_args.disaggregation_decode_enable_offload_kvcache:
-            self.decode_offload_manager.check_offload_progress()
sglang/srt/disaggregation/nixl/conn.py CHANGED
@@ -231,8 +231,8 @@ class NixlKVManager(CommonKVManager):
         ]
         for k in keys_to_remove:
             del self.connection_pool[k]
-        if failed_bootstrap_addr in self.prefill_tp_size_table:
-            del self.prefill_tp_size_table[failed_bootstrap_addr]
+        if failed_bootstrap_addr in self.prefill_attn_tp_size_table:
+            del self.prefill_attn_tp_size_table[failed_bootstrap_addr]
         if failed_bootstrap_addr in self.prefill_dp_size_table:
             del self.prefill_dp_size_table[failed_bootstrap_addr]
         if failed_bootstrap_addr in self.prefill_pp_size_table:
sglang/srt/disaggregation/prefill.py CHANGED
@@ -53,6 +53,7 @@ from sglang.srt.mem_cache.memory_pool import (
     NSATokenToKVPool,
     SWAKVPool,
 )
+from sglang.srt.tracing.trace import trace_event_batch, trace_slice, trace_slice_end
 from sglang.srt.utils import broadcast_pyobj, point_to_point_pyobj, require_mlp_sync
 
 if TYPE_CHECKING:
@@ -198,6 +199,7 @@ class PrefillBootstrapQueue:
         self._process_req(req)
         req.add_latency(RequestStage.PREFILL_PREPARE)
         self.queue.append(req)
+        trace_slice_end(RequestStage.PREFILL_PREPARE, req.rid, auto_next_anon=True)
 
     def extend(self, reqs: List[Req], num_kv_heads: int) -> None:
         for req in reqs:
@@ -289,6 +291,10 @@ class PrefillBootstrapQueue:
             req.time_stats.wait_queue_entry_time = time.perf_counter()
             req.add_latency(RequestStage.PREFILL_BOOTSTRAP)
 
+            trace_slice_end(
+                RequestStage.PREFILL_BOOTSTRAP, req.rid, auto_next_anon=True
+            )
+
         self.queue = [
             entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
         ]
@@ -316,6 +322,9 @@ class SchedulerDisaggregationPrefillMixin:
             )
             self.process_prefill_chunk()
             batch = self.get_new_batch_prefill()
+            if batch:
+                attrs = {"bid": hex(id(batch)), "batch_size": batch.batch_size()}
+                trace_event_batch("schedule", batch.reqs, attrs=attrs)
 
             if require_mlp_sync(self.server_args):
                 batch = self.prepare_mlp_sync_batch(batch)
@@ -348,6 +357,9 @@
             )
             self.process_prefill_chunk()
             batch = self.get_new_batch_prefill()
+            if batch:
+                attrs = {"bid": hex(id(batch)), "batch_size": batch.batch_size()}
+                trace_event_batch("schedule", batch.reqs, attrs=attrs)
 
             if require_mlp_sync(self.server_args):
                 batch = self.prepare_mlp_sync_batch(batch)
@@ -423,6 +435,7 @@
                 req.output_ids.append(next_token_id)
                 self.tree_cache.cache_unfinished_req(req)  # update the tree and lock
                 req.add_latency(RequestStage.PREFILL_FORWARD)
+                trace_slice(RequestStage.PREFILL_FORWARD, req.rid, auto_next_anon=True)
                 self.disagg_prefill_inflight_queue.append(req)
                 if self.spec_algorithm.is_eagle() and batch.spec_info is not None:
                     req.output_topk_p = batch.spec_info.topk_p[i]
@@ -487,6 +500,9 @@
 
             if self.enable_overlap:
                 self.send_kv_chunk(req, last_chunk=False, end_idx=req.tmp_end_idx)
+            trace_slice(
+                RequestStage.PREFILL_CHUNKED_FORWARD, req.rid, auto_next_anon=True
+            )
 
         self.maybe_send_health_check_signal()
 
@@ -558,6 +574,9 @@
             req.add_latency(RequestStage.PREFILL_TRANSFER_KV_CACHE)
             self.req_to_metadata_buffer_idx_allocator.free(req.metadata_buffer_index)
             req.metadata_buffer_index = -1
+            trace_slice(
+                RequestStage.PREFILL_TRANSFER_KV_CACHE, req.rid, thread_finish_flag=True
+            )
 
         self.disagg_prefill_inflight_queue = undone_reqs
 
@@ -569,7 +588,7 @@
         """
         polls = poll_and_all_reduce(
             [req.disagg_kv_sender for req in self.disagg_prefill_inflight_queue],
-            self.tp_worker.get_tp_group().cpu_group,
+            self.tp_worker.get_attention_tp_cpu_group(),
         )
 
         transferred_rids: List[str] = []
@@ -703,8 +722,11 @@
         else:
             data = None
 
-        if self.tp_size != 1:
+        if self.attn_tp_size != 1:
             data = broadcast_pyobj(
-                data, self.tp_group.rank, self.tp_cpu_group, src=self.tp_group.ranks[0]
+                data,
+                self.attn_tp_group.rank,
+                self.attn_tp_cpu_group,
+                src=self.attn_tp_group.ranks[0],
             )
         return data
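
The `trace_slice*` and `trace_event_batch` calls added above only emit spans when tracing is enabled; as the engine and http_server hunks below show, the endpoint attribute is now `otlp_traces_endpoint`. A minimal sketch of turning tracing on through the Python engine, assuming the flags pass through to ServerArgs as keyword arguments (model path and collector address are placeholders):

import sglang as sgl

engine = sgl.Engine(
    model_path="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model
    enable_trace=True,
    otlp_traces_endpoint="localhost:4317",  # assumed local OTLP collector
)
print(engine.generate("Hello", sampling_params={"max_new_tokens": 8}))
engine.shutdown()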
sglang/srt/distributed/device_communicators/custom_all_reduce.py CHANGED
@@ -18,6 +18,7 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import
     is_weak_contiguous,
 )
 from sglang.srt.distributed.parallel_state import in_the_same_node_as
+from sglang.srt.environ import envs
 from sglang.srt.utils import is_cuda, is_hip, log_info_on_rank0
 
 logger = logging.getLogger(__name__)
@@ -210,6 +211,7 @@ class CustomAllreduce:
             self.register_buffer(self.buffer)
 
         self.disabled = False
+        self.tms_cudagraph = envs.SGLANG_MEMORY_SAVER_CUDA_GRAPH.get()
 
     @staticmethod
     def create_shared_buffer(
@@ -394,7 +396,7 @@ class CustomAllreduce:
             if _is_hip:
                 return self.all_reduce_reg(input)
             else:
-                return self.all_reduce(input, registered=True)
+                return self.all_reduce(input, registered=not self.tms_cudagraph)
         else:
             # If warm up, mimic the allocation pattern since custom
             # allreduce is out-of-place.
sglang/srt/distributed/parallel_state.py CHANGED
@@ -68,7 +68,7 @@ REDUCE_OP_SUM = int(torch.distributed.ReduceOp.SUM)
 
 @dataclass
 class GraphCaptureContext:
-    stream: torch.cuda.Stream if not _is_npu else torch.npu.Stream
+    stream: torch.get_device_module().Stream
 
 
 @dataclass
@@ -498,7 +498,7 @@ class GroupCoordinator:
             maybe_pynccl_context = nullcontext()
         else:
             maybe_pynccl_context = pynccl_comm.change_state(
-                enable=True, stream=torch.cuda.current_stream()
+                enable=True, stream=torch.get_device_module().current_stream()
             )
 
         pymscclpp_comm = self.pymscclpp_comm
@@ -555,7 +555,7 @@
             and input_.symmetric_memory
         ):
             with self.pynccl_comm.change_state(
-                enable=True, stream=torch.cuda.current_stream()
+                enable=True, stream=torch.get_device_module().current_stream()
             ):
                 self.pynccl_comm.all_reduce(input_)
                 return input_
@@ -655,7 +655,9 @@
         world_size = self.world_size
         pynccl_comm = self.pynccl_comm
 
-        with pynccl_comm.change_state(enable=True, stream=torch.cuda.current_stream()):
+        with pynccl_comm.change_state(
+            enable=True, stream=torch.get_device_module().current_stream()
+        ):
             assert (
                 pynccl_comm is not None and not pynccl_comm.disabled
             ), "pynccl is required for reduce_scatterv"
@@ -779,7 +781,9 @@
         world_size = self.world_size
         pynccl_comm = self.pynccl_comm
 
-        with pynccl_comm.change_state(enable=True, stream=torch.cuda.current_stream()):
+        with pynccl_comm.change_state(
+            enable=True, stream=torch.get_device_module().current_stream()
+        ):
             assert (
                 pynccl_comm is not None and not pynccl_comm.disabled
             ), "pynccl is required for all_gatherv"
sglang/srt/entrypoints/engine.py CHANGED
@@ -143,10 +143,13 @@ class Engine(EngineBase):
 
         # Enable tracing
         if server_args.enable_trace:
-            process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
-            if server_args.disaggregation_mode == "null":
-                thread_label = "Tokenizer"
-                trace_set_thread_info(thread_label)
+            process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+            thread_label = "Tokenizer"
+            if server_args.disaggregation_mode == "prefill":
+                thread_label = "Prefill Tokenizer"
+            elif server_args.disaggregation_mode == "decode":
+                thread_label = "Decode Tokenizer"
+            trace_set_thread_info(thread_label)
 
         try:
             self.loop = asyncio.get_running_loop()
@@ -312,6 +315,7 @@
         image_data: Optional[MultimodalDataInputFormat] = None,
         audio_data: Optional[MultimodalDataInputFormat] = None,
         video_data: Optional[MultimodalDataInputFormat] = None,
+        dimensions: Optional[int] = None,
     ) -> Dict:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
@@ -322,6 +326,7 @@
             image_data=image_data,
             audio_data=audio_data,
             video_data=video_data,
+            dimensions=dimensions,
         )
         generator = self.tokenizer_manager.generate_request(obj, None)
         ret = self.loop.run_until_complete(generator.__anext__())
@@ -333,6 +338,7 @@
         image_data: Optional[MultimodalDataInputFormat] = None,
         audio_data: Optional[MultimodalDataInputFormat] = None,
         video_data: Optional[MultimodalDataInputFormat] = None,
+        dimensions: Optional[int] = None,
     ) -> Dict:
         """
         Asynchronous version of encode method.
@@ -345,6 +351,7 @@
             image_data=image_data,
             audio_data=audio_data,
             video_data=video_data,
+            dimensions=dimensions,
         )
         generator = self.tokenizer_manager.generate_request(obj, None)
         return await generator.__anext__()
@@ -670,7 +677,8 @@
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-    os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
+    if "NCCL_CUMEM_ENABLE" not in os.environ:
+        os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
     if not server_args.enable_symm_mem:
         os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "8"
sglang/srt/entrypoints/http_server.py CHANGED
@@ -220,9 +220,12 @@ async def lifespan(fast_api_app: FastAPI):
 
     # Init tracing
     if server_args.enable_trace:
-        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
-        if server_args.disaggregation_mode == "null":
-            trace_set_thread_info(thread_label)
+        process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "prefill":
+            thread_label = "Prefill" + thread_label
+        elif server_args.disaggregation_mode == "decode":
+            thread_label = "Decode" + thread_label
+        trace_set_thread_info(thread_label)
 
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
@@ -1168,6 +1171,8 @@
     """Show available models. OpenAI-compatible endpoint."""
     served_model_names = [_global_state.tokenizer_manager.served_model_name]
     model_cards = []
+
+    # Add base model
     for served_model_name in served_model_names:
         model_cards.append(
             ModelCard(
@@ -1176,6 +1181,20 @@
                 max_model_len=_global_state.tokenizer_manager.model_config.context_len,
             )
         )
+
+    # Add loaded LoRA adapters
+    if _global_state.tokenizer_manager.server_args.enable_lora:
+        lora_registry = _global_state.tokenizer_manager.lora_registry
+        for _, lora_ref in lora_registry.get_all_adapters().items():
+            model_cards.append(
+                ModelCard(
+                    id=lora_ref.lora_name,
+                    root=lora_ref.lora_path,
+                    parent=served_model_names[0],
+                    max_model_len=None,
+                )
+            )
+
     return ModelList(data=model_cards)
 
 
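
With LoRA enabled (`enable_lora`), `/v1/models` now lists loaded adapters alongside the base model, each carrying a `parent` field pointing at the base model. A quick check against a running server (port is an assumption):

import requests

resp = requests.get("http://localhost:30000/v1/models")
for card in resp.json()["data"]:
    print(card["id"], "->", card.get("parent"))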
sglang/srt/entrypoints/openai/protocol.py CHANGED
@@ -37,7 +37,11 @@ from pydantic import (
     model_validator,
 )
 from typing_extensions import Literal
-from xgrammar import StructuralTag
+
+try:
+    from xgrammar import StructuralTag
+except:
+    StructuralTag = Any
 
 from sglang.utils import convert_json_schema_to_str
 
@@ -54,6 +58,7 @@ class ModelCard(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     owned_by: str = "sglang"
     root: Optional[str] = None
+    parent: Optional[str] = None
     max_model_len: Optional[int] = None
 
 
@@ -108,6 +113,7 @@ class UsageInfo(BaseModel):
 
 class StreamOptions(BaseModel):
     include_usage: Optional[bool] = False
+    continuous_usage_stats: Optional[bool] = False
 
 
 class JsonSchemaResponseFormat(BaseModel):
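
A sketch of requesting per-chunk usage through the new `StreamOptions.continuous_usage_stats` field by posting directly to an OpenAI-compatible sglang server (URL and model name are assumptions; whether every chunk actually carries usage depends on the serving side honoring the flag):

import json
import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
        "stream_options": {"include_usage": True, "continuous_usage_stats": True},
    },
    stream=True,
)
for line in resp.iter_lines():
    # SSE lines look like b"data: {...}"; skip keep-alives and the final [DONE] marker.
    if line.startswith(b"data:") and b"[DONE]" not in line:
        print(json.loads(line[len(b"data:"):]).get("usage"))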