sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +2 -2
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +9 -7
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +48 -43
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +7 -2
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +227 -120
- sglang/srt/disaggregation/nixl/conn.py +1 -0
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +7 -1
- sglang/srt/entrypoints/engine.py +17 -2
- sglang/srt/entrypoints/http_server.py +17 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +1 -1
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +72 -71
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +3 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +76 -24
- sglang/srt/managers/schedule_policy.py +0 -3
- sglang/srt/managers/scheduler.py +113 -88
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +133 -34
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/memory_pool.py +2 -0
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +19 -14
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +23 -20
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +5 -6
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +30 -4
- sglang/srt/openai_api/protocol.py +0 -8
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +34 -4
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +6 -5
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +89 -14
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +6 -5
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +107 -104
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/deep_gemm.py
CHANGED
@@ -15,12 +15,9 @@ _ENABLE_JIT_DEEPGEMM = False
 if is_cuda():
     import deep_gemm
     from deep_gemm import get_num_sms
+    from deep_gemm.jit.compiler import get_nvcc_compiler
     from deep_gemm.jit_kernels.gemm import get_best_configs
-    from deep_gemm.jit_kernels.
-    from deep_gemm.jit_kernels.gemm import template as deep_gemm_gemm_template
-    from deep_gemm.jit_kernels.m_grouped_gemm import (
-        template as deep_gemm_grouped_gemm_template,
-    )
+    from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
     from deep_gemm.jit_kernels.tuner import jit_tuner

 sm_version = get_device_sm()
@@ -45,10 +42,25 @@ _COMPILE_WORKERS = get_int_env_var("SGL_JIT_DEEPGEMM_COMPILE_WORKERS", 4)
 _IN_PRECOMPILE_STAGE = get_bool_env_var("SGL_IN_DEEPGEMM_PRECOMPILE_STAGE", "false")

 # Force redirect deep_gemm cache_dir
-os.environ["
-    "SGL_DG_CACHE_DIR", os.path.expanduser("~")
+os.environ["DG_JIT_CACHE_DIR"] = os.getenv(
+    "SGL_DG_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "deep_gemm")
 )

+# Refer to https://github.com/deepseek-ai/DeepGEMM/commit/d75b218b7b8f4a5dd5406ac87905039ead3ae42f
+# NVRTC may have performance loss with some cases.
+# And NVCC JIT speed is also 9x faster in the ref commit
+_USE_NVRTC_DEFAULT = "0"
+if _ENABLE_JIT_DEEPGEMM:
+    try:
+        get_nvcc_compiler()
+    except:
+        logger.warning(
+            "NVCC Compiler not found, use NVRTC for DeepGEMM JIT "
+            "and may have performance loss with some cases."
+        )
+        _USE_NVRTC_DEFAULT = "1"
+os.environ["DG_JIT_USE_NVRTC"] = os.getenv("SGL_DG_USE_NVRTC", _USE_NVRTC_DEFAULT)
+

 def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
     global _BUILTIN_M_LIST
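The hunk above wires up two environment overrides: `SGL_DG_CACHE_DIR` relocates the DeepGEMM JIT cache (now defaulting to `~/.cache/deep_gemm`), and `SGL_DG_USE_NVRTC` can force NVRTC even though NVCC is preferred when present. A minimal sketch of that resolution order, assuming the variable names from the diff (the helper function itself is illustrative, not part of sglang):

```python
import os

def resolve_deep_gemm_env(nvcc_available: bool) -> dict:
    """Illustrative sketch: mirror the override order shown in the diff."""
    cache_dir = os.getenv(
        "SGL_DG_CACHE_DIR",
        os.path.join(os.path.expanduser("~"), ".cache", "deep_gemm"),
    )
    # NVCC is preferred; NVRTC becomes the default only when no compiler is
    # found, unless the user forces a choice via SGL_DG_USE_NVRTC.
    default_nvrtc = "0" if nvcc_available else "1"
    use_nvrtc = os.getenv("SGL_DG_USE_NVRTC", default_nvrtc)
    return {"DG_JIT_CACHE_DIR": cache_dir, "DG_JIT_USE_NVRTC": use_nvrtc}

print(resolve_deep_gemm_env(nvcc_available=True))
```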
@@ -103,10 +115,10 @@ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dic
 def _compile_warning_1():
     if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
-            "Entering DeepGEMM JIT Pre-
+            "Entering DeepGEMM JIT Pre-Compile session. "
             "And it may takes a long time(Typically 10-20 mins) "
            "if you have not run `sglang.compile_deep_gemm`. "
-            "
+            "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
             " for pre-compilation to reduce the overhead if you have not run it before. "
             "For example: "
             "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
@@ -115,7 +127,7 @@ def _compile_warning_1():

 def _compile_warning_2():
     logger.warning(
-        "Entering DeepGEMM JIT Single Kernel
+        "Entering DeepGEMM JIT Single Kernel Compile session. "
         "And it will makes inference throughput becomes flaky. "
         "Please run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
         " for pre-compilation to solve this issue. "
@@ -130,10 +142,18 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
     num_groups: int,
     config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
 ) -> None:
-
-
-
-
+    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
+    block_k = 128
+    num_tma_threads = 128
+    num_math_threads_per_group = 128
+    kwargs = {
+        "NUM_TMA_THREADS": num_tma_threads,
+        "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "BLOCK_K": block_k,
+        "NUM_SMS": num_sms,
+        "SMEM_SIZE": smem_config[0],
+    }
+    _, _ = jit_tuner.compile_and_tune(
         name="m_grouped_gemm_fp8_fp8_bf16_nt",
         keys={
             "N": n,
@@ -146,24 +166,11 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
             "NUM_STAGES": num_stages,
             "NUM_TMA_MULTICAST": tma_multicast_config[0],
             "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE":
+            "GEMM_TYPE": GemmType.GroupedMasked,
         },
         space=(),
-
-
-        ("lhs", torch.float8_e4m3fn),
-        ("lhs_scales", torch.float),
-        ("rhs", torch.float8_e4m3fn),
-        ("rhs_scales", torch.float),
-        ("out", torch.bfloat16),
-        ("grouped_layout", torch.int32),
-        ("m", int),
-        ("stream", torch.cuda.Stream),
-        ("num_sms", int),
-        ("smem_size", int),
-        ),
-        template=deep_gemm_grouped_gemm_template,
-        args=[],
+        kwargs=kwargs,
+        runtime_cls=FP8GemmRuntime,
     )

@@ -173,9 +180,18 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
     num_groups: int,
     config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
 ) -> None:
-
-
-
+    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
+    block_k = 128
+    num_tma_threads = 128
+    num_math_threads_per_group = 128
+    kwargs = {
+        "NUM_TMA_THREADS": num_tma_threads,
+        "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "BLOCK_K": block_k,
+        "NUM_SMS": num_sms,
+        "SMEM_SIZE": smem_config[0],
+    }
+    _, _ = jit_tuner.compile_and_tune(
         name="m_grouped_gemm_fp8_fp8_bf16_nt",
         keys={
             "N": n,
@@ -188,25 +204,11 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
             "NUM_STAGES": num_stages,
             "NUM_TMA_MULTICAST": tma_multicast_config[0],
             "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE":
+            "GEMM_TYPE": GemmType.GroupedContiguous,
         },
         space=(),
-
-
-        ("lhs", torch.float8_e4m3fn),
-        ("lhs_scales", torch.float),
-        ("rhs", torch.float8_e4m3fn),
-        ("rhs_scales", torch.float),
-        ("out", torch.bfloat16),
-        ("grouped_layout", torch.int32),
-        ("m", int),
-        ("num_groups", int),
-        ("stream", torch.cuda.Stream),
-        ("num_sms", int),
-        ("smem_size", int),
-        ),
-        template=deep_gemm_grouped_gemm_template,
-        args=[],
+        kwargs=kwargs,
+        runtime_cls=FP8GemmRuntime,
     )

@@ -216,9 +218,20 @@ def _compile_gemm_nt_f8f8bf16_one(
     _: int,  # _ is a dummy parameter to align with other interfaces
     config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
 ) -> None:
-
-
-
+    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
+    block_k = 128
+    num_tma_threads = 128
+    num_math_threads_per_group = 128
+    kwargs = {
+        "GEMM_TYPE": GemmType.Normal,
+        "NUM_TMA_THREADS": num_tma_threads,
+        "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "NUM_GROUPS": 1,
+        "BLOCK_K": block_k,
+        "NUM_SMS": num_sms,
+        "SMEM_SIZE": smem_config[0],
+    }
+    _, _ = jit_tuner.compile_and_tune(
         name="gemm_fp8_fp8_bf16_nt",
         keys={
             "N": n,
@@ -232,20 +245,8 @@ def _compile_gemm_nt_f8f8bf16_one(
             "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         },
         space=(),
-
-
-        ("lhs", torch.float8_e4m3fn),
-        ("lhs_scales", torch.float),
-        ("rhs", torch.float8_e4m3fn),
-        ("rhs_scales", torch.float),
-        ("out", torch.bfloat16),
-        ("m", int),
-        ("stream", torch.cuda.Stream),
-        ("num_sms", int),
-        ("smem_size", int),
-        ),
-        template=deep_gemm_gemm_template,
-        args=[],
+        kwargs=kwargs,
+        runtime_cls=FP8GemmRuntime,
     )

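All three compile helpers now build a plain kwargs dict and hand the DeepGEMM runtime class to the tuner instead of passing a kernel template and a typed argument list. A rough sketch of the shared shape implied by the three hunks above, with the per-variant differences factored out (names follow the diff; treat this as an illustration rather than the exact sglang code):

```python
# Illustrative only: per-variant settings implied by the three hunks above.
GEMM_VARIANTS = {
    "gemm_fp8_fp8_bf16_nt": {"GEMM_TYPE": "Normal", "NUM_GROUPS": 1},
    "m_grouped_gemm_fp8_fp8_bf16_nt (masked)": {"GEMM_TYPE": "GroupedMasked"},
    "m_grouped_gemm_fp8_fp8_bf16_nt (contiguous)": {"GEMM_TYPE": "GroupedContiguous"},
}

def build_common_kwargs(num_sms: int, smem_size: int) -> dict:
    """Shared JIT-compile kwargs that every variant in the diff passes."""
    return {
        "NUM_TMA_THREADS": 128,
        "NUM_MATH_THREADS_PER_GROUP": 128,
        "BLOCK_K": 128,
        "NUM_SMS": num_sms,
        "SMEM_SIZE": smem_size,
    }
```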
@@ -298,7 +299,7 @@ def _maybe_compile_deep_gemm_one_type_all(
         logger.info(
             f"Try DeepGEMM JIT Compiling for "
             f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
-            f"{' It only takes a
+            f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
         )

     # NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced
@@ -373,7 +374,7 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):

     from deep_gemm.jit.runtime import RuntimeCache

-    origin_func = RuntimeCache.
+    origin_func = RuntimeCache.get

     def __patched_func(self, *args, **kwargs):
         ret = origin_func(self, *args, **kwargs)
@@ -385,6 +386,6 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
             )
         return ret

-    RuntimeCache.
+    RuntimeCache.get = __patched_func
     yield
-    RuntimeCache.
+    RuntimeCache.get = origin_func
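The two `_log_jit_build` hunks only show fragments of the patch; the underlying pattern is a context manager that temporarily wraps `RuntimeCache.get` so that a cache miss (which triggers a JIT build) can be logged, then restores the original method. A self-contained sketch of that pattern, using a stand-in class rather than DeepGEMM's real `RuntimeCache`:

```python
import logging
from contextlib import contextmanager

logger = logging.getLogger(__name__)

class RuntimeCache:
    """Stand-in for deep_gemm.jit.runtime.RuntimeCache used only in this sketch."""
    def __init__(self):
        self._store = {}
    def get(self, key):
        return self._store.get(key)

@contextmanager
def log_jit_build(m: int, n: int, k: int):
    """Temporarily patch RuntimeCache.get to log cache misses (i.e. JIT builds)."""
    origin_func = RuntimeCache.get

    def patched_get(self, *args, **kwargs):
        ret = origin_func(self, *args, **kwargs)
        if ret is None:  # cache miss -> a JIT build is about to happen
            logger.warning("DeepGEMM JIT build for M=%d, N=%d, K=%d", m, n, k)
        return ret

    RuntimeCache.get = patched_get
    try:
        yield
    finally:
        RuntimeCache.get = origin_func

# Usage: wrap the GEMM call so unexpected JIT builds show up in the logs.
with log_jit_build(64, 4096, 7168):
    RuntimeCache().get(("gemm", 64, 4096, 7168))
```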
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -235,7 +235,7 @@ class Fp8LinearMethod(LinearMethodBase):
                     f"{input_size_per_partition} is not divisible by "
                     f"weight quantization block_k = {block_k}."
                 )
-            # Required by
+            # Required by column parallel or enabling merged weights
             if (
                 tp_size > 1 and output_size // output_size_per_partition == tp_size
             ) or len(output_partition_sizes) > 1:
@@ -491,7 +491,7 @@ class Fp8MoEMethod:
                 self.quant_config.weight_block_size[1],
             )
             # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-            # Required by
+            # Required by column parallel or enabling merged weights
             if intermediate_size % block_n != 0:
                 raise ValueError(
                     f"The output_size of gate's and up's weight = "
sglang/srt/layers/quantization/fp8_kernel.py
CHANGED
@@ -104,7 +104,7 @@ def _per_token_group_quant_fp8(
     y_s_ptr,
     # Stride of input
     y_stride,
-    #
+    # Columns of input
     N,
     # Avoid to divide zero
     eps,
@@ -342,7 +342,7 @@ def _static_quant_fp8(
     y_s_repeat_ptr,
     # Stride of input
     y_stride,
-    #
+    # Columns of input
     N,
     # Information for float8
     fp8_min,
@@ -794,7 +794,7 @@ def w8a8_block_fp8_matmul(
         config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
     else:
         # Default config
-        # Block-wise quant: BLOCK_SIZE_K must be
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
         config = {
             "BLOCK_SIZE_M": 64,
             "BLOCK_SIZE_N": block_size[0],
sglang/srt/layers/quantization/int8_kernel.py
CHANGED
@@ -76,7 +76,7 @@ def _per_token_group_quant_int8(
     y_s_ptr,
     # Stride of input
     y_stride,
-    #
+    # Columns of input
     N,
     # Avoid to divide zero
     eps,
@@ -370,7 +370,7 @@ def w8a8_block_int8_matmul(
         config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
     else:
         # Default config
-        # Block-wise quant: BLOCK_SIZE_K must be
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
        config = {
             "BLOCK_SIZE_M": 64,
             "BLOCK_SIZE_N": block_size[0],
sglang/srt/layers/sampler.py
CHANGED
@@ -239,10 +239,6 @@ def top_p_normalize_probs_torch(


 def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
-    assert len(top_logprobs_nums) == logprobs.shape[0], (
-        len(top_logprobs_nums),
-        logprobs.shape[0],
-    )
     max_k = max(top_logprobs_nums)
     ret = logprobs.topk(max_k, dim=1)
     values = ret.values.tolist()
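For context, the surrounding function pads every request to the largest requested `k` with a single batched `topk` and then slices per request on the host; the removed assert only re-checked that the batch sizes matched. A small standalone illustration of that shape (assuming this structure from the visible context lines, not the exact sglang function):

```python
import torch

def top_logprobs_sketch(logprobs: torch.Tensor, top_logprobs_nums: list):
    """Illustrative: one batched topk at max_k, then per-row slicing."""
    max_k = max(top_logprobs_nums)
    ret = logprobs.topk(max_k, dim=1)
    values, indices = ret.values.tolist(), ret.indices.tolist()
    return [(values[i][:k], indices[i][:k]) for i, k in enumerate(top_logprobs_nums)]

out = top_logprobs_sketch(torch.randn(2, 10).log_softmax(dim=1), [3, 1])
```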
@@ -13,6 +13,7 @@ from sglang.srt.distributed import (
|
|
13
13
|
get_tensor_model_parallel_world_size,
|
14
14
|
tensor_model_parallel_all_reduce,
|
15
15
|
)
|
16
|
+
from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
|
16
17
|
from sglang.srt.layers.parameter import BasevLLMParameter
|
17
18
|
from sglang.srt.layers.quantization.base_config import (
|
18
19
|
QuantizationConfig,
|
@@ -214,12 +215,14 @@ class VocabParallelEmbedding(torch.nn.Module):
         self,
         num_embeddings: int,
         embedding_dim: int,
+        *,
         params_dtype: Optional[torch.dtype] = None,
         org_num_embeddings: Optional[int] = None,
         padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
         enable_tp: bool = True,
+        use_attn_tp_group: bool = False,
         use_presharded_weights: bool = False,
     ):
         super().__init__()
@@ -227,9 +230,14 @@ class VocabParallelEmbedding(torch.nn.Module):

         self.enable_tp = enable_tp
         if self.enable_tp:
-
-
+            if use_attn_tp_group:
+                tp_rank = get_attention_tp_rank()
+                self.tp_size = get_attention_tp_size()
+            else:
+                tp_rank = get_tensor_model_parallel_rank()
+                self.tp_size = get_tensor_model_parallel_world_size()
         else:
+            assert use_attn_tp_group is False
             tp_rank = 0
             self.tp_size = 1

@@ -519,22 +527,25 @@ class ParallelLMHead(VocabParallelEmbedding):
         self,
         num_embeddings: int,
         embedding_dim: int,
+        *,
         bias: bool = False,
         params_dtype: Optional[torch.dtype] = None,
         org_num_embeddings: Optional[int] = None,
         padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_attn_tp_group: bool = False,
         use_presharded_weights: bool = False,
     ):
         super().__init__(
             num_embeddings,
             embedding_dim,
-            params_dtype,
-            org_num_embeddings,
-            padding_size,
-            quant_config,
-            prefix,
+            params_dtype=params_dtype,
+            org_num_embeddings=org_num_embeddings,
+            padding_size=padding_size,
+            quant_config=quant_config,
+            prefix=prefix,
+            use_attn_tp_group=use_attn_tp_group,
             use_presharded_weights=use_presharded_weights,
         )
         self.quant_config = quant_config
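Because everything after `embedding_dim` is now keyword-only in both constructors, positional calls such as the old `super().__init__(num_embeddings, embedding_dim, params_dtype, ...)` no longer work, and callers opt into sharding over the attention TP group with the new flag. A hedged usage sketch (the argument values are placeholders, and constructing the layer requires an initialized tensor-parallel process group):

```python
# Hedged usage sketch: values are illustrative; run inside an initialized
# tensor-parallel context. Only the keyword-only calling style is the point.
from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead

lm_head = ParallelLMHead(
    num_embeddings=129280,        # e.g. a large vocab size
    embedding_dim=7168,
    prefix="lm_head",
    use_attn_tp_group=True,       # shard over the attention TP group instead of full TP
)
```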
sglang/srt/lora/lora_manager.py
CHANGED
@@ -100,7 +100,7 @@ class LoRAManager:
             self.configs[name] = LoRAConfig(path)
             self.hf_target_names.update(self.configs[name].target_modules)

-        # Target lora weight names for lora_a and lora_b modules
+        # Target lora weight names for lora_a and lora_b modules respectively.
         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
         self.lora_weight_names: Set[Tuple[str]] = set(
             [get_stacked_name(module) for module in self.hf_target_names]
sglang/srt/lora/mem_pool.py
CHANGED
@@ -50,15 +50,15 @@ class LoRAMemoryPool:
         self.uid_to_buffer_id: Dict[Optional[str], int] = {}

         # Buffer idx -> lora uid in memory pool
-        # All uids are
-        # Here we don't
+        # All uids are initialized as empty strings for empty buffer slots
+        # Here we don't initialize to None since None is a valid uid
         self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch

     def get_lora_A_shape(
         self, module_name: str, base_model: torch.nn.Module
     ) -> Tuple[int]:
         """
-        Given a module_name (might be a stacked name), return the hidden dims of modules'
+        Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
         """
         input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
         c = get_stacked_multiply(module_name)
@@ -75,7 +75,7 @@ class LoRAMemoryPool:
         self, module_name: str, base_model: torch.nn.Module
     ) -> Tuple[int]:
         """
-        Given a module_name (might be a stacked name), return the hidden dims of modules'
+        Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
         """
         _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
         c = get_stacked_multiply(module_name)
sglang/srt/lora/triton_ops/gate_up_lora_b.py
CHANGED
@@ -77,7 +77,7 @@ def _gate_up_lora_b_kernel(
         k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
     )

-    #
+    # Iterate to compute the block in output matrix
     partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         x_tile = tl.load(
sglang/srt/lora/triton_ops/qkv_lora_b.py
CHANGED
@@ -79,7 +79,7 @@ def _qkv_lora_b_kernel(
         k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
     )

-    #
+    # Iterate to compute the block in output matrix
     partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         x_tile = tl.load(
sglang/srt/lora/triton_ops/sgemm_lora_a.py
CHANGED
@@ -67,7 +67,7 @@ def _sgemm_lora_a_kernel(
         k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
     )

-    #
+    # Iterate to compute the block in output matrix
     partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         x_tile = tl.load(
sglang/srt/lora/triton_ops/sgemm_lora_b.py
CHANGED
@@ -69,7 +69,7 @@ def _sgemm_lora_b_kernel(
         k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
     )

-    #
+    # Iterate to compute the block in output matrix
     partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
     for k in range(0, tl.cdiv(K, BLOCK_K)):
         x_tile = tl.load(
sglang/srt/lora/utils.py
CHANGED
@@ -79,7 +79,7 @@ def get_hidden_dim(
     module_name: str, config: AutoConfig, base_model: torch.nn.Module
 ) -> Tuple[int]:
     """
-    Given a module_name (might be a stacked name), return the hidden dims of modules'
+    Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
     """

     if hasattr(base_model, "get_hidden_dim"):
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -17,13 +17,13 @@ import logging
 import multiprocessing as mp
 import signal
 import threading
+import time
 from enum import Enum, auto

 import psutil
 import setproctitle
 import zmq

-from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.managers.io_struct import (
     TokenizedEmbeddingReqInput,
@@ -158,7 +158,7 @@ class DataParallelController:
         # This thread cannot be closed because otherwise the `kill_itself_when_parent_died`
         # function in scheduler.py will kill the scheduler.
         while True:
-
+            time.sleep(30 * 24 * 3600)

     def launch_dp_attention_schedulers(self, server_args, port_args):
         self.launch_tensor_parallel_group(server_args, port_args, 0, None)
@@ -210,7 +210,7 @@ class DataParallelController:
             )
             # compute zmq ports for this dp rank
             rank_port_args = PortArgs.init_new(server_args, dp_rank)
-            # Data parallelism
+            # Data parallelism reuses the tensor parallelism group,
             # so all dp ranks should use the same nccl port.
             rank_port_args.nccl_port = port_args.nccl_port

sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -28,6 +28,7 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
     BatchMultimodalDecodeReq,
+    BatchMultimodalOut,
     BatchStrOut,
     BatchTokenIDOut,
 )
@@ -60,6 +61,8 @@ class DecodeStatus:
     decode_ids: List[int]
     surr_offset: int
     read_offset: int
+    # Offset that's sent to tokenizer for incremental update.
+    sent_offset: int = 0


 class DetokenizerManager:
@@ -151,7 +154,7 @@ class DetokenizerManager:
                 self.decode_status[rid] = s
             else:
                 s = self.decode_status[rid]
-                s.decode_ids
+                s.decode_ids.extend(recv_obj.decode_ids[i])

             read_ids.append(
                 self.trim_matched_stop(
@@ -199,13 +202,15 @@ class DetokenizerManager:
             else:
                 new_text = find_printable_text(new_text)

-
-
-
-
-                recv_obj.no_stop_trim[i],
-            )
+            output_str = self.trim_matched_stop(
+                s.decoded_text + new_text,
+                recv_obj.finished_reasons[i],
+                recv_obj.no_stop_trim[i],
             )
+            # Incrementally send text.
+            incremental_output = output_str[s.sent_offset :]
+            s.sent_offset = len(output_str)
+            output_strs.append(incremental_output)

         return BatchStrOut(
             rids=recv_obj.rids,
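The new `sent_offset` field turns the detokenizer output into an incremental stream: each iteration re-trims the full decoded string and forwards only the suffix that has not been sent yet. A minimal standalone sketch of that bookkeeping, independent of the sglang classes:

```python
class StreamState:
    """Minimal stand-in for DecodeStatus: tracks how much text was already sent."""
    def __init__(self):
        self.sent_offset = 0

def next_chunk(s: StreamState, full_text: str) -> str:
    """Return only the not-yet-sent suffix, mirroring the sent_offset logic."""
    chunk = full_text[s.sent_offset:]
    s.sent_offset = len(full_text)
    return chunk

s = StreamState()
assert next_chunk(s, "Hello") == "Hello"
assert next_chunk(s, "Hello, world") == ", world"
assert next_chunk(s, "Hello, world") == ""  # nothing new to send
```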
@@ -232,7 +237,15 @@ class DetokenizerManager:
         )

     def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
-
+        outputs = self.tokenizer.detokenize(recv_obj)
+        return BatchMultimodalOut(
+            rids=recv_obj.rids,
+            finished_reasons=recv_obj.finished_reasons,
+            outputs=outputs,
+            prompt_tokens=recv_obj.prompt_tokens,
+            completion_tokens=recv_obj.completion_tokens,
+            cached_tokens=recv_obj.cached_tokens,
+        )


 class LimitedCapacityDict(OrderedDict):
sglang/srt/managers/io_struct.py
CHANGED
@@ -12,7 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """
-The definition of objects
+The definition of objects transferred between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """

@@ -836,6 +836,8 @@ class ProfileReqInput:
     # the caller doesn't need to run stop_profile.
     num_steps: Optional[int] = None
     activities: Optional[List[Literal["CPU", "GPU", "MEM", "CUDA_PROFILER"]]] = None
+    with_stack: Optional[bool] = None
+    record_shapes: Optional[bool] = None


 class ProfileReqType(Enum):
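The two new fields share their names with `torch.profiler` options and presumably flow through to it; a request that leaves them unset keeps the previous behavior. A hedged example of constructing the request (field values are illustrative):

```python
from sglang.srt.managers.io_struct import ProfileReqInput

# Hedged example: profile 10 steps and also record Python stacks and tensor
# shapes; the field names mirror torch.profiler's with_stack / record_shapes.
req = ProfileReqInput(
    num_steps=10,
    activities=["CPU", "GPU"],
    with_stack=True,
    record_shapes=True,
)
```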
sglang/srt/managers/mm_utils.py
CHANGED
@@ -51,7 +51,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
         self, input_ids: List[int], mm_inputs: MultimodalInputs
     ) -> List[int]:
         """
-        This function will replace the data-tokens
+        This function will replace the data-tokens in between with pad_values accordingly
         """
         pad_values = [item.pad_value for item in mm_inputs.mm_items]
         data_token_pairs = self.data_token_id_pairs