sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
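To inspect these changes locally rather than relying on this listing, one rough approach is to download the two wheels and diff individual members. This is a sketch only: it assumes network access, that `pip download` can fetch both wheels, and the `wheels/` directory name and chosen member are illustrative.

    # Sketch: download both sglang wheels and diff one file between them.
    import difflib
    import pathlib
    import subprocess
    import zipfile

    for version in ("0.5.4", "0.5.4.post2"):
        subprocess.run(
            ["pip", "download", f"sglang=={version}", "--no-deps",
             "--dest", f"wheels/{version}"],
            check=True,
        )

    def read_member(version: str, member: str) -> list[str]:
        # Each wheel is a zip archive; read one file out of it.
        wheel = next(pathlib.Path(f"wheels/{version}").glob("sglang-*.whl"))
        with zipfile.ZipFile(wheel) as zf:
            return zf.read(member).decode("utf-8").splitlines(keepends=True)

    member = "sglang/version.py"
    diff = difflib.unified_diff(
        read_member("0.5.4", member),
        read_member("0.5.4.post2", member),
        fromfile=f"0.5.4/{member}",
        tofile=f"0.5.4.post2/{member}",
    )
    print("".join(diff))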
Files changed (195)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +73 -14
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/launch_server.py +2 -0
  5. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  6. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
  7. sglang/srt/checkpoint_engine/__init__.py +9 -0
  8. sglang/srt/checkpoint_engine/update.py +317 -0
  9. sglang/srt/compilation/backend.py +1 -1
  10. sglang/srt/configs/__init__.py +2 -0
  11. sglang/srt/configs/deepseek_ocr.py +542 -10
  12. sglang/srt/configs/deepseekvl2.py +95 -194
  13. sglang/srt/configs/kimi_linear.py +160 -0
  14. sglang/srt/configs/mamba_utils.py +66 -0
  15. sglang/srt/configs/model_config.py +30 -7
  16. sglang/srt/constants.py +7 -0
  17. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  18. sglang/srt/disaggregation/decode.py +34 -6
  19. sglang/srt/disaggregation/nixl/conn.py +2 -2
  20. sglang/srt/disaggregation/prefill.py +25 -3
  21. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  22. sglang/srt/distributed/parallel_state.py +9 -12
  23. sglang/srt/entrypoints/engine.py +31 -20
  24. sglang/srt/entrypoints/grpc_server.py +0 -1
  25. sglang/srt/entrypoints/http_server.py +94 -94
  26. sglang/srt/entrypoints/openai/protocol.py +7 -1
  27. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  28. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  29. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  30. sglang/srt/environ.py +23 -2
  31. sglang/srt/eplb/expert_distribution.py +64 -1
  32. sglang/srt/eplb/expert_location.py +106 -36
  33. sglang/srt/function_call/function_call_parser.py +2 -0
  34. sglang/srt/function_call/minimax_m2.py +367 -0
  35. sglang/srt/grpc/compile_proto.py +3 -0
  36. sglang/srt/layers/activation.py +6 -0
  37. sglang/srt/layers/attention/ascend_backend.py +233 -5
  38. sglang/srt/layers/attention/attention_registry.py +3 -0
  39. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  40. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  41. sglang/srt/layers/attention/fla/kda.py +1359 -0
  42. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  43. sglang/srt/layers/attention/flashattention_backend.py +19 -8
  44. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  45. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
  46. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  47. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  48. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  49. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  50. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  51. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  52. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  53. sglang/srt/layers/attention/nsa_backend.py +157 -23
  54. sglang/srt/layers/attention/triton_backend.py +4 -1
  55. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  56. sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
  57. sglang/srt/layers/attention/utils.py +78 -0
  58. sglang/srt/layers/communicator.py +24 -1
  59. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  60. sglang/srt/layers/layernorm.py +35 -6
  61. sglang/srt/layers/logits_processor.py +9 -20
  62. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  63. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  64. sglang/srt/layers/moe/ep_moe/layer.py +78 -289
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  67. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  68. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  69. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  70. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  71. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  72. sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
  73. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  75. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  76. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  77. sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
  78. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  79. sglang/srt/layers/moe/topk.py +35 -10
  80. sglang/srt/layers/moe/utils.py +3 -4
  81. sglang/srt/layers/pooler.py +21 -2
  82. sglang/srt/layers/quantization/__init__.py +13 -84
  83. sglang/srt/layers/quantization/auto_round.py +394 -0
  84. sglang/srt/layers/quantization/awq.py +0 -3
  85. sglang/srt/layers/quantization/base_config.py +7 -0
  86. sglang/srt/layers/quantization/fp8.py +68 -63
  87. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  88. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  89. sglang/srt/layers/quantization/gguf.py +566 -0
  90. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  91. sglang/srt/layers/quantization/mxfp4.py +30 -38
  92. sglang/srt/layers/quantization/unquant.py +23 -45
  93. sglang/srt/layers/quantization/w4afp8.py +38 -2
  94. sglang/srt/layers/radix_attention.py +5 -2
  95. sglang/srt/layers/rotary_embedding.py +130 -46
  96. sglang/srt/layers/sampler.py +12 -1
  97. sglang/srt/lora/lora_registry.py +9 -0
  98. sglang/srt/managers/async_mm_data_processor.py +122 -0
  99. sglang/srt/managers/data_parallel_controller.py +30 -3
  100. sglang/srt/managers/detokenizer_manager.py +3 -0
  101. sglang/srt/managers/io_struct.py +29 -4
  102. sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
  103. sglang/srt/managers/schedule_batch.py +74 -15
  104. sglang/srt/managers/scheduler.py +185 -144
  105. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  107. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  108. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  109. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  110. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  111. sglang/srt/managers/session_controller.py +6 -5
  112. sglang/srt/managers/tokenizer_manager.py +165 -78
  113. sglang/srt/managers/tp_worker.py +24 -1
  114. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  115. sglang/srt/mem_cache/common.py +1 -0
  116. sglang/srt/mem_cache/hicache_storage.py +7 -1
  117. sglang/srt/mem_cache/memory_pool.py +253 -57
  118. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  119. sglang/srt/mem_cache/radix_cache.py +4 -0
  120. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  121. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  122. sglang/srt/metrics/collector.py +46 -3
  123. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  124. sglang/srt/model_executor/forward_batch_info.py +55 -14
  125. sglang/srt/model_executor/model_runner.py +77 -170
  126. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  127. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  128. sglang/srt/model_loader/weight_utils.py +1 -1
  129. sglang/srt/models/bailing_moe.py +9 -2
  130. sglang/srt/models/deepseek_nextn.py +11 -2
  131. sglang/srt/models/deepseek_v2.py +296 -78
  132. sglang/srt/models/glm4.py +391 -77
  133. sglang/srt/models/glm4_moe.py +322 -354
  134. sglang/srt/models/glm4_moe_nextn.py +4 -14
  135. sglang/srt/models/glm4v.py +196 -55
  136. sglang/srt/models/glm4v_moe.py +29 -197
  137. sglang/srt/models/gpt_oss.py +1 -10
  138. sglang/srt/models/kimi_linear.py +678 -0
  139. sglang/srt/models/llama4.py +1 -1
  140. sglang/srt/models/llama_eagle3.py +11 -1
  141. sglang/srt/models/longcat_flash.py +2 -2
  142. sglang/srt/models/minimax_m2.py +922 -0
  143. sglang/srt/models/nvila.py +355 -0
  144. sglang/srt/models/nvila_lite.py +184 -0
  145. sglang/srt/models/qwen2.py +23 -2
  146. sglang/srt/models/qwen2_moe.py +30 -15
  147. sglang/srt/models/qwen3.py +35 -5
  148. sglang/srt/models/qwen3_moe.py +18 -12
  149. sglang/srt/models/qwen3_next.py +7 -0
  150. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  151. sglang/srt/multimodal/processors/base_processor.py +1 -0
  152. sglang/srt/multimodal/processors/glm4v.py +1 -1
  153. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  154. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  155. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  156. sglang/srt/multiplex/pdmux_context.py +164 -0
  157. sglang/srt/parser/conversation.py +7 -1
  158. sglang/srt/parser/reasoning_parser.py +28 -1
  159. sglang/srt/sampling/custom_logit_processor.py +67 -1
  160. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  161. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  162. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  163. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  164. sglang/srt/server_args.py +459 -199
  165. sglang/srt/single_batch_overlap.py +2 -4
  166. sglang/srt/speculative/draft_utils.py +16 -0
  167. sglang/srt/speculative/eagle_info.py +42 -36
  168. sglang/srt/speculative/eagle_info_v2.py +68 -25
  169. sglang/srt/speculative/eagle_utils.py +261 -16
  170. sglang/srt/speculative/eagle_worker.py +11 -3
  171. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  172. sglang/srt/speculative/spec_info.py +305 -31
  173. sglang/srt/speculative/spec_utils.py +44 -8
  174. sglang/srt/tracing/trace.py +121 -12
  175. sglang/srt/utils/common.py +142 -74
  176. sglang/srt/utils/hf_transformers_utils.py +38 -12
  177. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  178. sglang/test/kits/radix_cache_server_kit.py +50 -0
  179. sglang/test/runners.py +31 -7
  180. sglang/test/simple_eval_common.py +5 -3
  181. sglang/test/simple_eval_humaneval.py +1 -0
  182. sglang/test/simple_eval_math.py +1 -0
  183. sglang/test/simple_eval_mmlu.py +1 -0
  184. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  185. sglang/test/test_deterministic.py +235 -12
  186. sglang/test/test_deterministic_utils.py +2 -1
  187. sglang/test/test_utils.py +7 -1
  188. sglang/version.py +1 -1
  189. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
  190. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
  191. sglang/srt/models/vila.py +0 -306
  192. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  193. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  194. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  195. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -11,6 +11,11 @@ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruc
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
 ## run with profiling:
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
+## run with profiling to custom directory:
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile
+## run with CUDA profiler (nsys):
+nsys profile --force-overwrite=true -o bench_one_batch python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile --profiler_activities CUDA_PROFILER
 # Usage (correctness test):
 python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct

@@ -93,6 +98,68 @@ profile_activities = [torch.profiler.ProfilerActivity.CPU] + [
 ]


+def start_profile(profiler_activities, profile_record_shapes=False, rank_print=print):
+    """
+    Abstracted function to start profiling based on profiler_activities.
+    Returns profiler object (or None).
+    """
+    if "CUDA_PROFILER" in profiler_activities:
+        try:
+            torch.cuda.cudart().cudaProfilerStart()
+            rank_print("CUDA Profiler started (nsys will begin capturing)")
+        except Exception as e:
+            rank_print(f"Failed to start CUDA profiler: {e}")
+        return None
+    else:
+        activities = []
+        if "CPU" in profiler_activities:
+            activities.append(torch.profiler.ProfilerActivity.CPU)
+        if "GPU" in profiler_activities:
+            activities.append(torch.profiler.ProfilerActivity.CUDA)
+        if activities:
+            profiler = torch.profiler.profile(
+                activities=activities,
+                with_stack=True,
+                record_shapes=profile_record_shapes,
+            )
+            profiler.start()
+            return profiler
+    return None
+
+
+def stop_profile(
+    profiler,
+    profiler_activities,
+    rank_print=print,
+    save_trace=False,
+    trace_filename=None,
+    stage=None,
+):
+    """
+    Abstracted function to stop profiling based on profiler_activities.
+    Optionally saves trace results and prints completion messages.
+    """
+    if "CUDA_PROFILER" in profiler_activities:
+        try:
+            torch.cuda.cudart().cudaProfilerStop()
+            rank_print("CUDA Profiler stopped (nsys should dump traces)")
+        except Exception as e:
+            rank_print(f"Failed to stop CUDA profiler: {e}")
+    elif profiler is not None:
+        profiler.stop()
+
+    if save_trace:
+        if profiler is not None:
+            if trace_filename:
+                _save_profile_trace_results(profiler, trace_filename)
+            stage_desc = f"for {stage}" if stage else ""
+            rank_print(
+                f"torch profiler chrome trace {stage_desc} saved to {trace_filename}"
+            )
+        if "CUDA_PROFILER" in profiler_activities:
+            rank_print(f"CUDA profiler trace for {stage} completed")
+
+
 @dataclasses.dataclass
 class BenchArgs:
     run_name: str = "default"
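As a rough usage sketch of the two helpers added above (not part of the package; `run_benchmarked_step` is a hypothetical placeholder and the trace path is illustrative):

    # Hypothetical caller of start_profile/stop_profile from this diff.
    activities = ("CPU", "GPU")  # or ("CUDA_PROFILER",) to drive an nsys capture instead
    profiler = start_profile(activities, profile_record_shapes=True)

    run_benchmarked_step()  # placeholder for the work being measured

    stop_profile(
        profiler,
        activities,
        save_trace=True,
        trace_filename="/tmp/profile_batch1_input256_output32_prefill.trace.json.gz",
        stage="prefill",
    )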
@@ -107,6 +174,8 @@ class BenchArgs:
     log_decode_step: int = 0
     profile: bool = False
     profile_record_shapes: bool = False
+    profiler_activities: Tuple[str] = ("CPU", "GPU")
+    profile_stage: str = "all"
     profile_filename_prefix: str = "profile"

     @staticmethod
@@ -135,14 +204,27 @@ class BenchArgs:
             default=BenchArgs.log_decode_step,
             help="Log decode latency by step, default is set to zero to disable.",
         )
-        parser.add_argument(
-            "--profile", action="store_true", help="Use Torch Profiler."
-        )
+        parser.add_argument("--profile", action="store_true", help="Enable profiling.")
         parser.add_argument(
             "--profile-record-shapes",
             action="store_true",
             help="Record tensor shapes in profiling results.",
         )
+        parser.add_argument(
+            "--profiler_activities",
+            type=str,
+            nargs="+",
+            default=["CPU", "GPU"],
+            choices=["CPU", "GPU", "CUDA_PROFILER"],
+            help="Profiler activities: CPU, GPU, CUDA_PROFILER. If CPU/GPU, use torch profiler. If CUDA_PROFILER, use CUDA profiler.",
+        )
+        parser.add_argument(
+            "--profile-stage",
+            type=str,
+            default=BenchArgs.profile_stage,
+            choices=["all", "prefill", "decode"],
+            help="Which stage to profile: all, prefill, or decode only.",
+        )
         parser.add_argument(
             "--profile-filename-prefix",
             type=str,
@@ -337,6 +419,18 @@ def _read_prompts_from_file(prompt_file, rank_print):
     return pf.readlines()


+def _get_torch_profiler_output_dir():
+    return os.environ.get("SGLANG_TORCH_PROFILER_DIR", "/tmp")
+
+
+def _create_torch_profiler_filename(
+    profile_filename_prefix, batch_size, input_len, output_len, stage
+):
+    output_dir = _get_torch_profiler_output_dir()
+    filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_{stage}.trace.json.gz"
+    return os.path.join(output_dir, filename)
+
+
 def _save_profile_trace_results(profiler, filename):
     parent_dir = os.path.dirname(os.path.abspath(filename))
     os.makedirs(parent_dir, exist_ok=True)
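For reference, a quick sketch of what these helpers resolve to (values illustrative; with SGLANG_TORCH_PROFILER_DIR unset the directory falls back to /tmp):

    # Illustrative only: batch size 1, input 256, output 32, prefill stage.
    trace_path = _create_torch_profiler_filename("profile", 1, 256, 32, "prefill")
    # -> /tmp/profile_batch1_input256_output32_prefill.trace.json.gz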
@@ -413,7 +507,10 @@ def latency_test_run_once(
     log_decode_step,
     profile,
     profile_record_shapes,
+    profiler_activities,
     profile_filename_prefix,
+    profile_stage,
+    tp_rank,
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
@@ -422,7 +519,6 @@
         )
         return

-    # Clear the pools.
     model_runner.req_to_token_pool.clear()
     model_runner.token_to_kv_pool_allocator.clear()

@@ -436,20 +532,33 @@
     tot_latency = 0

     profiler = None
-    if profile:
-        profiler = torch.profiler.profile(
-            activities=profile_activities,
-            with_stack=True,
-            record_shapes=profile_record_shapes,
+    enable_profile_prefill = profile and profile_stage in ["all", "prefill"]
+    if enable_profile_prefill:
+        profiler = start_profile(
+            profiler_activities,
+            profile_record_shapes=profile_record_shapes,
+            rank_print=rank_print,
         )
-        profiler.start()

-    # Prefill
     synchronize(device)
     tic = time.perf_counter()
     next_token_ids, _, batch = extend(reqs, model_runner)
     synchronize(device)
     prefill_latency = time.perf_counter() - tic
+
+    if enable_profile_prefill:
+        trace_filename = _create_torch_profiler_filename(
+            profile_filename_prefix, batch_size, input_len, output_len, "prefill"
+        )
+        stop_profile(
+            profiler,
+            profiler_activities,
+            rank_print=rank_print,
+            save_trace=True,
+            trace_filename=trace_filename,
+            stage="prefill",
+        )
+
     tot_latency += prefill_latency
     throughput = input_len * batch_size / prefill_latency
     rank_print(
@@ -458,29 +567,37 @@
     measurement_results["prefill_latency"] = prefill_latency
     measurement_results["prefill_throughput"] = throughput

-    if profile:
-        profiler.stop()
-        trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
-        _save_profile_trace_results(profiler, trace_filename)
-        rank_print(f"torch profiler chrome trace for prefill saved to {trace_filename}")
-
-    # Decode
     decode_latencies = []
+    profile_step_of_interest = output_len // 2
+    enable_profile_decode = profile and profile_stage in ["all", "decode"]
     for i in range(output_len - 1):
         synchronize(device)
-        if profile and i == output_len / 2:
-            profiler = None
-            profiler = torch.profiler.profile(
-                activities=profile_activities,
-                with_stack=True,
-                record_shapes=profile_record_shapes,
+        profiler = None
+        if enable_profile_decode and i == profile_step_of_interest:
+            profiler = start_profile(
+                profiler_activities,
+                profile_record_shapes=profile_record_shapes,
+                rank_print=rank_print,
             )
-            profiler.start()

         tic = time.perf_counter()
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         synchronize(device)
         latency = time.perf_counter() - tic
+
+        if enable_profile_decode and i == profile_step_of_interest:
+            trace_filename = _create_torch_profiler_filename(
+                profile_filename_prefix, batch_size, input_len, output_len, "decode"
+            )
+            stop_profile(
+                profiler,
+                profiler_activities,
+                rank_print=rank_print,
+                save_trace=True,
+                trace_filename=trace_filename,
+                stage="decode",
+            )
+
         tot_latency += latency
         throughput = batch_size / latency
         decode_latencies.append(latency)
@@ -489,14 +606,6 @@
                 f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )

-        if profile and i == output_len / 2:
-            profiler.stop()
-            trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
-            _save_profile_trace_results(profiler, trace_filename)
-            rank_print(
-                f"torch profiler chrome trace for decoding 1 token saved to {trace_filename}"
-            )
-
     # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
@@ -557,7 +666,10 @@
         log_decode_step=0,
         profile=False,
         profile_record_shapes=False,
-        profile_filename_prefix="",  # not used
+        profiler_activities=("CPU", "GPU"),
+        profile_filename_prefix="",
+        profile_stage="all",
+        tp_rank=tp_rank,
     )

     rank_print("Benchmark ...")
@@ -604,7 +716,10 @@
             bench_args.log_decode_step,
             bench_args.profile if tp_rank == 0 else None,
             bench_args.profile_record_shapes if tp_rank == 0 else None,
+            bench_args.profiler_activities,
             bench_args.profile_filename_prefix,
+            bench_args.profile_stage,
+            tp_rank,
         )
         if ret is not None:
             result_list.append(ret)
sglang/bench_serving.py CHANGED
@@ -88,6 +88,7 @@ class RequestFuncOutput:
     latency: float = 0.0
     ttft: float = 0.0  # Time to first token
     itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+    text_chunks: List[str] = field(default_factory=list)
     prompt_len: int = 0
     error: str = ""
     output_len: int = 0
@@ -258,6 +259,9 @@ async def async_request_openai_completions(

                                 # Decoding phase
                                 else:
+                                    output.text_chunks.append(
+                                        data["choices"][0]["text"]
+                                    )
                                     output.itl.append(timestamp - most_recent_timestamp)

                                 most_recent_timestamp = timestamp
@@ -574,9 +578,8 @@ async def async_request_sglang_generate(
                                     num_new_tokens = output_len - last_output_len
                                     if num_new_tokens == 0:
                                         continue
-                                    adjust_itl = (
-                                        timestamp - most_recent_timestamp
-                                    ) / num_new_tokens
+                                    chunk_gap = timestamp - most_recent_timestamp
+                                    adjust_itl = chunk_gap / num_new_tokens
                                     output.itl.extend([adjust_itl] * num_new_tokens)

                                 most_recent_timestamp = timestamp
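The renaming above does not change the arithmetic. As a worked sketch (numbers illustrative): a streamed chunk arriving 30 ms after the previous one and carrying 3 new tokens contributes three equal inter-token latencies.

    # Worked example of the per-token adjustment (values illustrative).
    chunk_gap = 0.030          # seconds since the previous chunk
    num_new_tokens = 3         # tokens reported for this chunk
    adjust_itl = chunk_gap / num_new_tokens
    itl_contribution = [adjust_itl] * num_new_tokens  # [0.01, 0.01, 0.01]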
@@ -764,6 +767,7 @@ def get_dataset(args, tokenizer, model_id=None):
             image_content=args.image_content,
             image_format=args.image_format,
             image_resolution=args.image_resolution,
+            backend=args.backend,
         )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
@@ -781,6 +785,7 @@ def get_dataset(args, tokenizer, model_id=None):
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
             processor=processor,
+            backend=args.backend,
             fixed_output_len=args.random_output_len,
             random_sample=True,
         )
@@ -1009,6 +1014,7 @@ async def get_mooncake_request_over_time(
 def sample_mmmu_requests(
     num_requests: int,
     processor: AutoProcessor | AutoTokenizer,
+    backend: str = "sglang",
     fixed_output_len: Optional[int] = None,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
@@ -1081,7 +1087,7 @@ def sample_mmmu_requests(
             text_prompt = f"Question: {question}\n\nAnswer: "
             output_len = fixed_output_len if fixed_output_len is not None else 256
             data_row = create_mm_data_row(
-                text_prompt, [image], [image_data], output_len, processor
+                text_prompt, [image], [image_data], output_len, processor, backend
             )
             filtered_dataset.append(data_row)

@@ -1316,13 +1322,19 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     )


-def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
+def create_mm_data_row(
+    text_prompt, images: list, images_base64, output_len, processor, backend
+):
     try:
-        content_items = [
-            {"type": "image", "image": {"url": image_base64}}
-            for image_base64 in images_base64
-        ]
-        content_items.append({"type": "text", "text": text_prompt})
+        if type(processor).__name__ == "Phi4MMProcessor":
+            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+            content_items = text_prompt.replace("image 1", "|endoftext10|")
+        else:
+            content_items = [
+                {"type": "image", "image": {"url": image_base64}}
+                for image_base64 in images_base64
+            ]
+            content_items.append({"type": "text", "text": text_prompt})
         prompt_str = processor.apply_chat_template(
             [{"role": "user", "content": content_items}],
             add_generation_prompt=True,
@@ -1357,13 +1369,24 @@ def create_mm_data_row(text_prompt, images: list, images_base64, output_len, pro
         )["input_ids"].numel()
     except Exception:
         # Fallback: just tokenize the text prompt directly
-        text_prompt_len = len(processor.tokenizer.encode(text_prompt))
+        tokenizer_to_use = (
+            processor.tokenizer if hasattr(processor, "tokenizer") else processor
+        )
+        text_prompt_len = len(tokenizer_to_use.encode(text_prompt))

     # Vision tokens = total tokens - text tokens
     vision_prompt_len = prompt_len - text_prompt_len

+    use_raw_prompt = backend in [
+        "sglang-oai",
+        "sglang-oai-chat",
+        "vllm",
+        "vllm-chat",
+        "lmdeploy",
+        "lmdeploy-chat",
+    ]
     return DatasetRow(
-        prompt=text_prompt,
+        prompt=text_prompt if use_raw_prompt else prompt_str,
         prompt_len=prompt_len,
         output_len=output_len,
         text_prompt_len=text_prompt_len,
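A worked sketch of the accounting above (numbers illustrative): the vision token count is simply whatever the fully templated prompt adds on top of the bare text prompt, and the OpenAI-compatible chat backends keep the raw text prompt so the server can apply its own chat template.

    # Illustrative numbers only.
    prompt_len = 842        # tokens in the templated multimodal prompt
    text_prompt_len = 58    # tokens in the bare text prompt
    vision_prompt_len = prompt_len - text_prompt_len  # 784 tokens attributed to images

    backend = "sglang-oai-chat"  # one of the backends listed in the diff above
    use_raw_prompt = backend in [
        "sglang-oai", "sglang-oai-chat", "vllm", "vllm-chat", "lmdeploy", "lmdeploy-chat"
    ]
    # DatasetRow.prompt becomes text_prompt here; other backends get the templated prompt_str.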
@@ -1382,6 +1405,7 @@ def sample_image_requests(
     image_content: str,
     image_format: str,
     image_resolution: str,
+    backend: str,
 ) -> List[DatasetRow]:
     """Generate requests with images.

@@ -1447,6 +1471,7 @@ def sample_image_requests(
             list(images_base64),
             int(output_lens[i]),
             processor,
+            backend,
         )

         dataset.append(data_row)
@@ -1607,6 +1632,7 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
     backend: str,
+    accept_length: Optional[float] = None,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
@@ -1618,6 +1644,14 @@ def calculate_metrics(
     tpots: List[float] = []
     ttfts: List[float] = []
     e2e_latencies: List[float] = []
+    retokenized_itls: List[float] = []
+
+    use_retokenized_itl = (
+        accept_length is not None
+        and accept_length > 0
+        and backend in ("sglang-oai", "sglang-oai-chat")
+    )
+
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_len
@@ -1631,7 +1665,17 @@ def calculate_metrics(
             total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
-                itls += outputs[i].itl
+                if use_retokenized_itl:
+                    for k, itl in enumerate(outputs[i].itl):
+                        num_tokens = len(
+                            tokenizer.encode(
+                                outputs[i].text_chunks[k], add_special_tokens=False
+                            )
+                        )
+                        adjusted_itl = itl / num_tokens
+                        retokenized_itls.extend([adjusted_itl] * num_tokens)
+                else:
+                    itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)

             e2e_latencies.append(outputs[i].latency)
@@ -1647,6 +1691,8 @@ def calculate_metrics(
             "on the benchmark arguments.",
             stacklevel=2,
         )
+
+    itls = retokenized_itls if use_retokenized_itl else itls
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
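As a worked sketch of the retokenized-ITL path above (values illustrative): when speculative decoding reports an accept length, a single streamed chunk can carry several tokens, so each chunk's latency is split across the tokens obtained by re-encoding the chunk text.

    # Illustrative only: one 24 ms chunk whose text re-encodes to 4 tokens.
    itl = 0.024
    num_tokens = 4            # len(tokenizer.encode(chunk_text, add_special_tokens=False))
    adjusted_itl = itl / num_tokens
    retokenized_itls = [adjusted_itl] * num_tokens  # four 6 ms entries instead of one 24 ms entry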
@@ -1910,6 +1956,7 @@ async def benchmark(
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
         backend=backend,
+        accept_length=accept_length,
     )

     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
@@ -1989,6 +2036,7 @@ async def benchmark(
     ):
         result = {
             # Arguments
+            "tag": getattr(args, "tag", None),
             "backend": args.backend,
             "dataset_name": args.dataset_name,
             "request_rate": "trace" if use_trace_timestamps else request_rate,
@@ -2114,6 +2162,9 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "mooncake_num_rounds"):
         args.mooncake_num_rounds = 1

+    if not hasattr(args, "served_model_name"):
+        args.served_model_name = None
+
     print(f"benchmark_args={args}")

     # Set global environments
@@ -2227,7 +2278,7 @@ def run_benchmark(args_: argparse.Namespace):

     # Read dataset
     backend = args.backend
-    model_id = args.model
+    model_id = args.served_model_name or args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer = get_tokenizer(tokenizer_id)
     input_requests = get_dataset(args, tokenizer, model_id)
@@ -2326,6 +2377,11 @@ if __name__ == "__main__":
         type=str,
         help="Name or path of the model. If not set, the default model will request /v1/models for conf.",
     )
+    parser.add_argument(
+        "--served-model-name",
+        type=str,
+        help="The name of the model as served by the serving service. If not set, this defaults to the value of --model.",
+    )
     parser.add_argument(
         "--tokenizer",
         type=str,
@@ -2583,5 +2639,8 @@
         ],
         help="Underlying workload for the mooncake dataset.",
     )
+    parser.add_argument(
+        "--tag", type=str, default=None, help="The tag to be dumped to output."
+    )
     args = parser.parse_args()
     run_benchmark(args)
sglang/compile_deep_gemm.py CHANGED
@@ -104,15 +104,21 @@ def launch_server_process_and_send_one_request(
        if response.status_code == 200:
            # Rank-0 node send a request to sync with other node and then return.
            if server_args.node_rank == 0:
+               payload = {
+                   "input_ids": [0, 1, 2, 3],
+                   "sampling_params": {
+                       "max_new_tokens": 8,
+                       "temperature": 0,
+                   },
+               }
+               # In PD mode, include fake bootstrap fields so workers don't assert
+               if server_args.disaggregation_mode != "null":
+                   payload["bootstrap_host"] = FAKE_BOOTSTRAP_HOST
+                   payload["bootstrap_room"] = 0
+
                response = requests.post(
                    f"{base_url}/generate",
-                   json={
-                       "input_ids": [0, 1, 2, 3],
-                       "sampling_params": {
-                           "max_new_tokens": 8,
-                           "temperature": 0,
-                       },
-                   },
+                   json=payload,
                    timeout=600,
                )
                if response.status_code != 200:
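A condensed, hypothetical standalone version of the warmup call assembled above; in the real script the mode comes from server_args and the bootstrap host from sglang's FAKE_BOOTSTRAP_HOST constant, and the URL here is illustrative.

    import requests

    base_url = "http://127.0.0.1:30000"   # illustrative
    disaggregation_mode = "prefill"       # anything other than "null" means PD mode
    fake_bootstrap_host = "0.0.0.0"       # stand-in for FAKE_BOOTSTRAP_HOST

    payload = {
        "input_ids": [0, 1, 2, 3],
        "sampling_params": {"max_new_tokens": 8, "temperature": 0},
    }
    if disaggregation_mode != "null":
        payload["bootstrap_host"] = fake_bootstrap_host
        payload["bootstrap_room"] = 0

    response = requests.post(f"{base_url}/generate", json=payload, timeout=600)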
sglang/launch_server.py CHANGED
@@ -12,10 +12,12 @@ if __name__ == "__main__":

    try:
        if server_args.grpc_mode:
+           # Handle gRPC server
            from sglang.srt.entrypoints.grpc_server import serve_grpc

            asyncio.run(serve_grpc(server_args))
        else:
+           # Handle HTTP server
            from sglang.srt.entrypoints.http_server import launch_server

            launch_server(server_args)
sglang/srt/batch_invariant_ops/__init__.py CHANGED
@@ -9,6 +9,7 @@ from .batch_invariant_ops import (
     log_softmax,
     matmul_persistent,
     mean_dim,
+    rms_norm_batch_invariant,
     set_batch_invariant_mode,
 )

@@ -24,4 +25,5 @@
     "mean_dim",
     "get_batch_invariant_attention_block_size",
     "AttentionBlockSize",
+    "rms_norm_batch_invariant",
 ]
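With the re-export above, the batch-invariant RMSNorm kernel is importable alongside the existing helpers. A sketch of the import only, since the call signature is not part of this diff:

    from sglang.srt.batch_invariant_ops import (
        rms_norm_batch_invariant,
        set_batch_invariant_mode,
    )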