sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +149 -34
- sglang/bench_serving.py +18 -3
- sglang/compile_deep_gemm.py +13 -7
- sglang/srt/batch_invariant_ops/__init__.py +2 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
- sglang/srt/checkpoint_engine/__init__.py +9 -0
- sglang/srt/checkpoint_engine/update.py +317 -0
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/deepseek_ocr.py +542 -10
- sglang/srt/configs/deepseekvl2.py +95 -194
- sglang/srt/configs/kimi_linear.py +160 -0
- sglang/srt/configs/mamba_utils.py +66 -0
- sglang/srt/configs/model_config.py +25 -2
- sglang/srt/constants.py +7 -0
- sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
- sglang/srt/disaggregation/decode.py +34 -6
- sglang/srt/disaggregation/nixl/conn.py +2 -2
- sglang/srt/disaggregation/prefill.py +25 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
- sglang/srt/distributed/parallel_state.py +9 -5
- sglang/srt/entrypoints/engine.py +13 -5
- sglang/srt/entrypoints/http_server.py +22 -3
- sglang/srt/entrypoints/openai/protocol.py +7 -1
- sglang/srt/entrypoints/openai/serving_chat.py +42 -0
- sglang/srt/entrypoints/openai/serving_completions.py +10 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/environ.py +7 -0
- sglang/srt/eplb/expert_distribution.py +34 -1
- sglang/srt/eplb/expert_location.py +106 -36
- sglang/srt/grpc/compile_proto.py +3 -0
- sglang/srt/layers/attention/ascend_backend.py +233 -5
- sglang/srt/layers/attention/attention_registry.py +3 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
- sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
- sglang/srt/layers/attention/fla/kda.py +1359 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
- sglang/srt/layers/attention/flashattention_backend.py +7 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
- sglang/srt/layers/attention/mamba/mamba.py +20 -11
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
- sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
- sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
- sglang/srt/layers/attention/nsa/transform_index.py +1 -1
- sglang/srt/layers/attention/nsa_backend.py +157 -23
- sglang/srt/layers/attention/triton_backend.py +4 -1
- sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
- sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
- sglang/srt/layers/communicator.py +23 -1
- sglang/srt/layers/layernorm.py +16 -2
- sglang/srt/layers/logits_processor.py +4 -20
- sglang/srt/layers/moe/ep_moe/layer.py +0 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
- sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
- sglang/srt/layers/moe/topk.py +31 -6
- sglang/srt/layers/pooler.py +21 -2
- sglang/srt/layers/quantization/__init__.py +9 -78
- sglang/srt/layers/quantization/auto_round.py +394 -0
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +168 -11
- sglang/srt/layers/rotary_embedding.py +117 -45
- sglang/srt/lora/lora_registry.py +9 -0
- sglang/srt/managers/async_mm_data_processor.py +122 -0
- sglang/srt/managers/data_parallel_controller.py +30 -3
- sglang/srt/managers/detokenizer_manager.py +3 -0
- sglang/srt/managers/io_struct.py +26 -4
- sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
- sglang/srt/managers/schedule_batch.py +74 -15
- sglang/srt/managers/scheduler.py +164 -129
- sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
- sglang/srt/managers/scheduler_pp_mixin.py +7 -2
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
- sglang/srt/managers/session_controller.py +6 -5
- sglang/srt/managers/tokenizer_manager.py +154 -59
- sglang/srt/managers/tp_worker.py +24 -1
- sglang/srt/mem_cache/base_prefix_cache.py +23 -4
- sglang/srt/mem_cache/common.py +1 -0
- sglang/srt/mem_cache/memory_pool.py +171 -57
- sglang/srt/mem_cache/memory_pool_host.py +12 -5
- sglang/srt/mem_cache/radix_cache.py +4 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
- sglang/srt/metrics/collector.py +46 -3
- sglang/srt/model_executor/cuda_graph_runner.py +15 -3
- sglang/srt/model_executor/forward_batch_info.py +11 -11
- sglang/srt/model_executor/model_runner.py +76 -21
- sglang/srt/model_executor/npu_graph_runner.py +7 -3
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/bailing_moe.py +9 -2
- sglang/srt/models/deepseek_nextn.py +11 -2
- sglang/srt/models/deepseek_v2.py +149 -34
- sglang/srt/models/glm4.py +391 -77
- sglang/srt/models/glm4v.py +196 -55
- sglang/srt/models/glm4v_moe.py +0 -1
- sglang/srt/models/gpt_oss.py +1 -10
- sglang/srt/models/kimi_linear.py +678 -0
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/llama_eagle3.py +11 -1
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/minimax_m2.py +1 -1
- sglang/srt/models/qwen2.py +1 -1
- sglang/srt/models/qwen2_moe.py +30 -15
- sglang/srt/models/qwen3.py +1 -1
- sglang/srt/models/qwen3_moe.py +16 -8
- sglang/srt/models/qwen3_next.py +7 -0
- sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
- sglang/srt/multiplex/multiplexing_mixin.py +209 -0
- sglang/srt/multiplex/pdmux_context.py +164 -0
- sglang/srt/parser/conversation.py +7 -1
- sglang/srt/sampling/custom_logit_processor.py +67 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
- sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
- sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
- sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
- sglang/srt/server_args.py +103 -22
- sglang/srt/single_batch_overlap.py +4 -1
- sglang/srt/speculative/draft_utils.py +16 -0
- sglang/srt/speculative/eagle_info.py +42 -36
- sglang/srt/speculative/eagle_info_v2.py +68 -25
- sglang/srt/speculative/eagle_utils.py +261 -16
- sglang/srt/speculative/eagle_worker.py +11 -3
- sglang/srt/speculative/eagle_worker_v2.py +15 -9
- sglang/srt/speculative/spec_info.py +305 -31
- sglang/srt/speculative/spec_utils.py +44 -8
- sglang/srt/tracing/trace.py +121 -12
- sglang/srt/utils/common.py +55 -32
- sglang/srt/utils/hf_transformers_utils.py +38 -16
- sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
- sglang/test/kits/radix_cache_server_kit.py +50 -0
- sglang/test/runners.py +31 -7
- sglang/test/simple_eval_common.py +5 -3
- sglang/test/simple_eval_humaneval.py +1 -0
- sglang/test/simple_eval_math.py +1 -0
- sglang/test/simple_eval_mmlu.py +1 -0
- sglang/test/simple_eval_mmmu_vlm.py +1 -0
- sglang/test/test_utils.py +7 -1
- sglang/version.py +1 -1
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
- /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
@@ -11,6 +11,11 @@ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruc
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
 ## run with profiling:
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
+## run with profiling to custom directory:
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile
+## run with CUDA profiler (nsys):
+nsys profile --force-overwrite=true -o bench_one_batch python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile --profiler_activities CUDA_PROFILER
 # Usage (correctness test):
 python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
@@ -93,6 +98,68 @@ profile_activities = [torch.profiler.ProfilerActivity.CPU] + [
 ]
 
 
+def start_profile(profiler_activities, profile_record_shapes=False, rank_print=print):
+    """
+    Abstracted function to start profiling based on profiler_activities.
+    Returns profiler object (or None).
+    """
+    if "CUDA_PROFILER" in profiler_activities:
+        try:
+            torch.cuda.cudart().cudaProfilerStart()
+            rank_print("CUDA Profiler started (nsys will begin capturing)")
+        except Exception as e:
+            rank_print(f"Failed to start CUDA profiler: {e}")
+        return None
+    else:
+        activities = []
+        if "CPU" in profiler_activities:
+            activities.append(torch.profiler.ProfilerActivity.CPU)
+        if "GPU" in profiler_activities:
+            activities.append(torch.profiler.ProfilerActivity.CUDA)
+        if activities:
+            profiler = torch.profiler.profile(
+                activities=activities,
+                with_stack=True,
+                record_shapes=profile_record_shapes,
+            )
+            profiler.start()
+            return profiler
+        return None
+
+
+def stop_profile(
+    profiler,
+    profiler_activities,
+    rank_print=print,
+    save_trace=False,
+    trace_filename=None,
+    stage=None,
+):
+    """
+    Abstracted function to stop profiling based on profiler_activities.
+    Optionally saves trace results and prints completion messages.
+    """
+    if "CUDA_PROFILER" in profiler_activities:
+        try:
+            torch.cuda.cudart().cudaProfilerStop()
+            rank_print("CUDA Profiler stopped (nsys should dump traces)")
+        except Exception as e:
+            rank_print(f"Failed to stop CUDA profiler: {e}")
+    elif profiler is not None:
+        profiler.stop()
+
+    if save_trace:
+        if profiler is not None:
+            if trace_filename:
+                _save_profile_trace_results(profiler, trace_filename)
+                stage_desc = f"for {stage}" if stage else ""
+                rank_print(
+                    f"torch profiler chrome trace {stage_desc} saved to {trace_filename}"
+                )
+        if "CUDA_PROFILER" in profiler_activities:
+            rank_print(f"CUDA profiler trace for {stage} completed")
+
+
 @dataclasses.dataclass
 class BenchArgs:
     run_name: str = "default"
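For orientation, a minimal usage sketch of the two helpers added above, pairing them around a timed region; run_prefill and the trace path are placeholders, not names from this diff:

profiler = start_profile(("CPU", "GPU"), profile_record_shapes=True)
run_prefill()  # placeholder for the region being profiled
stop_profile(
    profiler,
    ("CPU", "GPU"),
    save_trace=True,
    trace_filename="/tmp/profile_prefill.trace.json.gz",  # placeholder path
    stage="prefill",
)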
@@ -107,6 +174,8 @@ class BenchArgs:
     log_decode_step: int = 0
     profile: bool = False
     profile_record_shapes: bool = False
+    profiler_activities: Tuple[str] = ("CPU", "GPU")
+    profile_stage: str = "all"
     profile_filename_prefix: str = "profile"
 
     @staticmethod
@@ -135,14 +204,27 @@ class BenchArgs:
             default=BenchArgs.log_decode_step,
             help="Log decode latency by step, default is set to zero to disable.",
         )
-        parser.add_argument(
-            "--profile", action="store_true", help="Use Torch Profiler."
-        )
+        parser.add_argument("--profile", action="store_true", help="Enable profiling.")
         parser.add_argument(
             "--profile-record-shapes",
             action="store_true",
             help="Record tensor shapes in profiling results.",
         )
+        parser.add_argument(
+            "--profiler_activities",
+            type=str,
+            nargs="+",
+            default=["CPU", "GPU"],
+            choices=["CPU", "GPU", "CUDA_PROFILER"],
+            help="Profiler activities: CPU, GPU, CUDA_PROFILER. If CPU/GPU, use torch profiler. If CUDA_PROFILER, use CUDA profiler.",
+        )
+        parser.add_argument(
+            "--profile-stage",
+            type=str,
+            default=BenchArgs.profile_stage,
+            choices=["all", "prefill", "decode"],
+            help="Which stage to profile: all, prefill, or decode only.",
+        )
         parser.add_argument(
             "--profile-filename-prefix",
             type=str,
@@ -337,6 +419,18 @@ def _read_prompts_from_file(prompt_file, rank_print):
         return pf.readlines()
 
 
+def _get_torch_profiler_output_dir():
+    return os.environ.get("SGLANG_TORCH_PROFILER_DIR", "/tmp")
+
+
+def _create_torch_profiler_filename(
+    profile_filename_prefix, batch_size, input_len, output_len, stage
+):
+    output_dir = _get_torch_profiler_output_dir()
+    filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_{stage}.trace.json.gz"
+    return os.path.join(output_dir, filename)
+
+
 def _save_profile_trace_results(profiler, filename):
     parent_dir = os.path.dirname(os.path.abspath(filename))
     os.makedirs(parent_dir, exist_ok=True)
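As an illustration of the two helpers above (argument values are made up), the resulting trace path looks like this:

# Illustrative only: _create_torch_profiler_filename is the helper defined above;
# the directory falls back to /tmp unless SGLANG_TORCH_PROFILER_DIR is set.
path = _create_torch_profiler_filename("profile", 1, 256, 32, "prefill")
# path == "/tmp/profile_batch1_input256_output32_prefill.trace.json.gz"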
@@ -413,7 +507,10 @@ def latency_test_run_once(
     log_decode_step,
     profile,
     profile_record_shapes,
+    profiler_activities,
     profile_filename_prefix,
+    profile_stage,
+    tp_rank,
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
@@ -422,7 +519,6 @@ def latency_test_run_once(
         )
         return
 
-    # Clear the pools.
     model_runner.req_to_token_pool.clear()
     model_runner.token_to_kv_pool_allocator.clear()
 
@@ -436,20 +532,33 @@ def latency_test_run_once(
     tot_latency = 0
 
     profiler = None
-
-
-
-
-
+    enable_profile_prefill = profile and profile_stage in ["all", "prefill"]
+    if enable_profile_prefill:
+        profiler = start_profile(
+            profiler_activities,
+            profile_record_shapes=profile_record_shapes,
+            rank_print=rank_print,
         )
-        profiler.start()
 
-    # Prefill
     synchronize(device)
     tic = time.perf_counter()
     next_token_ids, _, batch = extend(reqs, model_runner)
     synchronize(device)
     prefill_latency = time.perf_counter() - tic
+
+    if enable_profile_prefill:
+        trace_filename = _create_torch_profiler_filename(
+            profile_filename_prefix, batch_size, input_len, output_len, "prefill"
+        )
+        stop_profile(
+            profiler,
+            profiler_activities,
+            rank_print=rank_print,
+            save_trace=True,
+            trace_filename=trace_filename,
+            stage="prefill",
+        )
+
     tot_latency += prefill_latency
     throughput = input_len * batch_size / prefill_latency
     rank_print(
@@ -458,29 +567,37 @@ def latency_test_run_once(
     measurement_results["prefill_latency"] = prefill_latency
     measurement_results["prefill_throughput"] = throughput
 
-    if profile:
-        profiler.stop()
-        trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
-        _save_profile_trace_results(profiler, trace_filename)
-        rank_print(f"torch profiler chrome trace for prefill saved to {trace_filename}")
-
-    # Decode
     decode_latencies = []
+    profile_step_of_interest = output_len // 2
+    enable_profile_decode = profile and profile_stage in ["all", "decode"]
     for i in range(output_len - 1):
         synchronize(device)
-
-
-        profiler =
-
-
-
+        profiler = None
+        if enable_profile_decode and i == profile_step_of_interest:
+            profiler = start_profile(
+                profiler_activities,
+                profile_record_shapes=profile_record_shapes,
+                rank_print=rank_print,
             )
-            profiler.start()
 
         tic = time.perf_counter()
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         synchronize(device)
         latency = time.perf_counter() - tic
+
+        if enable_profile_decode and i == profile_step_of_interest:
+            trace_filename = _create_torch_profiler_filename(
+                profile_filename_prefix, batch_size, input_len, output_len, "decode"
+            )
+            stop_profile(
+                profiler,
+                profiler_activities,
+                rank_print=rank_print,
+                save_trace=True,
+                trace_filename=trace_filename,
+                stage="decode",
+            )
+
         tot_latency += latency
         throughput = batch_size / latency
         decode_latencies.append(latency)
@@ -489,14 +606,6 @@ def latency_test_run_once(
                 f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
 
-        if profile and i == output_len / 2:
-            profiler.stop()
-            trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
-            _save_profile_trace_results(profiler, trace_filename)
-            rank_print(
-                f"torch profiler chrome trace for decoding 1 token saved to {trace_filename}"
-            )
-
     # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
@@ -557,7 +666,10 @@ def latency_test(
         log_decode_step=0,
         profile=False,
         profile_record_shapes=False,
-
+        profiler_activities=("CPU", "GPU"),
+        profile_filename_prefix="",
+        profile_stage="all",
+        tp_rank=tp_rank,
     )
 
     rank_print("Benchmark ...")
@@ -604,7 +716,10 @@ def latency_test(
             bench_args.log_decode_step,
             bench_args.profile if tp_rank == 0 else None,
             bench_args.profile_record_shapes if tp_rank == 0 else None,
+            bench_args.profiler_activities,
             bench_args.profile_filename_prefix,
+            bench_args.profile_stage,
+            tp_rank,
         )
         if ret is not None:
             result_list.append(ret)
sglang/bench_serving.py
CHANGED
@@ -1014,7 +1014,7 @@ async def get_mooncake_request_over_time(
 def sample_mmmu_requests(
     num_requests: int,
     processor: AutoProcessor | AutoTokenizer,
-    backend: str,
+    backend: str = "sglang",
     fixed_output_len: Optional[int] = None,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
@@ -1369,7 +1369,10 @@ def create_mm_data_row(
         )["input_ids"].numel()
     except Exception:
         # Fallback: just tokenize the text prompt directly
-
+        tokenizer_to_use = (
+            processor.tokenizer if hasattr(processor, "tokenizer") else processor
+        )
+        text_prompt_len = len(tokenizer_to_use.encode(text_prompt))
 
     # Vision tokens = total tokens - text tokens
     vision_prompt_len = prompt_len - text_prompt_len
@@ -2033,6 +2036,7 @@ async def benchmark(
 ):
     result = {
         # Arguments
+        "tag": getattr(args, "tag", None),
         "backend": args.backend,
         "dataset_name": args.dataset_name,
         "request_rate": "trace" if use_trace_timestamps else request_rate,
@@ -2158,6 +2162,9 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "mooncake_num_rounds"):
         args.mooncake_num_rounds = 1
 
+    if not hasattr(args, "served_model_name"):
+        args.served_model_name = None
+
     print(f"benchmark_args={args}")
 
     # Set global environments
@@ -2271,7 +2278,7 @@ def run_benchmark(args_: argparse.Namespace):
 
     # Read dataset
     backend = args.backend
-    model_id = args.model
+    model_id = args.served_model_name or args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer = get_tokenizer(tokenizer_id)
     input_requests = get_dataset(args, tokenizer, model_id)
@@ -2370,6 +2377,11 @@ if __name__ == "__main__":
         type=str,
         help="Name or path of the model. If not set, the default model will request /v1/models for conf.",
     )
+    parser.add_argument(
+        "--served-model-name",
+        type=str,
+        help="The name of the model as served by the serving service. If not set, this defaults to the value of --model.",
+    )
     parser.add_argument(
         "--tokenizer",
         type=str,
@@ -2627,5 +2639,8 @@ if __name__ == "__main__":
         ],
         help="Underlying workload for the mooncake dataset.",
     )
+    parser.add_argument(
+        "--tag", type=str, default=None, help="The tag to be dumped to output."
+    )
     args = parser.parse_args()
     run_benchmark(args)
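A small sketch of how the two new bench_serving options behave; the namespace below is hand-built with made-up values for illustration, whereas real runs build args via parser.parse_args():

import argparse

args = argparse.Namespace(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # made-up example values
    served_model_name="llama3-8b",
    tag="nightly",
)
model_id = args.served_model_name or args.model  # "llama3-8b" is used as the reported model id
tag = getattr(args, "tag", None)  # "nightly" is dumped into the benchmark result dict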
sglang/compile_deep_gemm.py
CHANGED
@@ -104,15 +104,21 @@ def launch_server_process_and_send_one_request(
     if response.status_code == 200:
         # Rank-0 node send a request to sync with other node and then return.
         if server_args.node_rank == 0:
+            payload = {
+                "input_ids": [0, 1, 2, 3],
+                "sampling_params": {
+                    "max_new_tokens": 8,
+                    "temperature": 0,
+                },
+            }
+            # In PD mode, include fake bootstrap fields so workers don't assert
+            if server_args.disaggregation_mode != "null":
+                payload["bootstrap_host"] = FAKE_BOOTSTRAP_HOST
+                payload["bootstrap_room"] = 0
+
             response = requests.post(
                 f"{base_url}/generate",
-                json=
-                    "input_ids": [0, 1, 2, 3],
-                    "sampling_params": {
-                        "max_new_tokens": 8,
-                        "temperature": 0,
-                    },
-                },
+                json=payload,
                 timeout=600,
             )
             if response.status_code != 200:
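For reference, a sketch of the warm-up request body in PD (disaggregation) mode, assuming FAKE_BOOTSTRAP_HOST is the constant the new code references:

# Sketch only: equivalent payload after the PD-mode branch above has run.
payload = {
    "input_ids": [0, 1, 2, 3],
    "sampling_params": {"max_new_tokens": 8, "temperature": 0},
    "bootstrap_host": FAKE_BOOTSTRAP_HOST,  # constant referenced by this module
    "bootstrap_room": 0,
}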
sglang/srt/batch_invariant_ops/__init__.py
CHANGED
@@ -9,6 +9,7 @@ from .batch_invariant_ops import (
     log_softmax,
     matmul_persistent,
     mean_dim,
+    rms_norm_batch_invariant,
     set_batch_invariant_mode,
 )
 
@@ -24,4 +25,5 @@ __all__ = [
     "mean_dim",
     "get_batch_invariant_attention_block_size",
     "AttentionBlockSize",
+    "rms_norm_batch_invariant",
 ]
sglang/srt/batch_invariant_ops/batch_invariant_ops.py
CHANGED
@@ -579,6 +579,126 @@ def bmm_batch_invariant(a, b, *, out=None):
     )
 
 
+@triton.jit
+def _rms_norm_kernel(
+    input_ptr,
+    weight_ptr,
+    output_ptr,
+    input_row_stride,
+    output_row_stride,
+    n_cols,
+    eps,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Compute RMS normalization along the last dimension of a 2D tensor.
+    RMS Norm: y = x / sqrt(mean(x^2) + eps) * weight
+    Each block handles one row of the input tensor.
+    """
+    row_idx = tl.program_id(0).to(tl.int64)
+    row_start_ptr = input_ptr + row_idx * input_row_stride
+    output_row_start_ptr = output_ptr + row_idx * output_row_stride
+
+    # Step 1: Compute sum of squares in float32 to avoid overflow
+    sum_sq = tl.zeros([1], dtype=tl.float32)
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+
+        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=0.0)
+        # Convert to float32 for accumulation to prevent overflow
+        vals_f32 = vals.to(tl.float32)
+        sq_vals = vals_f32 * vals_f32
+        sum_sq += tl.sum(tl.where(mask, sq_vals, 0.0))
+
+    # Step 2: Compute RMS (root mean square) in float32
+    mean_sq = sum_sq / n_cols
+    rms = tl.sqrt(mean_sq + eps)
+    inv_rms = 1.0 / rms
+
+    # Step 3: Normalize and apply weight
+    for col_offset in range(0, n_cols, BLOCK_SIZE):
+        col_idx = col_offset + tl.arange(0, BLOCK_SIZE)
+        mask = col_idx < n_cols
+        vals = tl.load(row_start_ptr + col_idx, mask=mask, other=0.0)
+        weight = tl.load(weight_ptr + col_idx, mask=mask, other=1.0)
+        # Compute in float32 then convert back to input dtype
+        vals_f32 = vals.to(tl.float32)
+        weight_f32 = weight.to(tl.float32)
+        output_f32 = vals_f32 * inv_rms * weight_f32
+        output = output_f32.to(vals.dtype)
+        tl.store(output_row_start_ptr + col_idx, output, mask=mask)
+
+
+def rms_norm(
+    input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """
+    Compute RMS normalization using Triton kernel.
+
+    RMS Norm normalizes the input by the root mean square and scales by weight:
+        output = input / sqrt(mean(input^2) + eps) * weight
+
+    Args:
+        input: Input tensor of shape (..., hidden_size)
+        weight: Weight tensor of shape (hidden_size,)
+        eps: Small constant for numerical stability
+
+    Returns:
+        Tensor with RMS normalization applied along the last dimension
+    """
+    assert weight.dim() == 1, "Weight must be 1-dimensional"
+    assert input.shape[-1] == weight.shape[0], (
+        f"Input last dimension ({input.shape[-1]}) must match "
+        f"weight dimension ({weight.shape[0]})"
+    )
+
+    # Flatten all dimensions except the last one
+    original_shape = input.shape
+    input_2d = input.reshape(-1, input.shape[-1])
+    input_2d = input_2d.contiguous()
+    weight = weight.contiguous()
+
+    n_rows, n_cols = input_2d.shape
+
+    output = torch.empty_like(input_2d)
+    BLOCK_SIZE = 1024
+    grid = (n_rows,)
+    _rms_norm_kernel[grid](
+        input_2d,
+        weight,
+        output,
+        input_2d.stride(0),
+        output.stride(0),
+        n_cols,
+        eps,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return output.reshape(original_shape)
+
+
+def rms_norm_batch_invariant(
+    input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """
+    Batch-invariant wrapper for RMS normalization.
+
+    This function provides a deterministic, batch-invariant implementation
+    of RMS normalization for use with the batch_invariant mode.
+
+    Adapted from @https://github.com/vllm-project/vllm/blob/66a168a197ba214a5b70a74fa2e713c9eeb3251a/vllm/model_executor/layers/batch_invariant.py#L649
+
+    Args:
+        input: Input tensor of shape (..., hidden_size)
+        weight: Weight tensor of shape (hidden_size,)
+        eps: Small constant for numerical stability
+
+    Returns:
+        RMS normalized tensor
+    """
+    return rms_norm(input, weight, eps=eps)
+
+
 _batch_invariant_MODE = False
 _batch_invariant_LIB = None
 _original_torch_bmm = None