sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff shows the changes between two publicly available package versions as they appear in their public registry, and is provided for informational purposes only.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -1,7 +1,8 @@
 # SGLang public APIs
 
 # Frontend Language APIs
-from sglang.api import (
+from sglang.global_config import global_config
+from sglang.lang.api import (
     Engine,
     Runtime,
     assistant,
@@ -25,22 +26,26 @@ from sglang.api import (
     user_end,
     video,
 )
-from sglang.global_config import global_config
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
     token_length_normalized,
     unconditional_likelihood_normalized,
 )
+
+# Lazy import some libraries
 from sglang.utils import LazyImport
 from sglang.version import __version__
 
-ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
 Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
 LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
 OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
 VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
 
+# Runtime Engine APIs
+ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
+Engine = LazyImport("sglang.srt.entrypoints.engine", "Engine")
+
 __all__ = [
     "Engine",
     "Runtime",
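After this change, `sglang.Engine` resolves lazily to the runtime engine in `sglang.srt.entrypoints.engine` rather than the frontend-language symbol. A minimal offline-generation sketch under that assumption (the model path and sampling parameters are placeholders):

    import sglang

    # The LazyImport resolves on first attribute access.
    engine = sglang.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
    output = engine.generate("The capital of France is", {"temperature": 0, "max_new_tokens": 16})
    print(output)
    engine.shutdown()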
sglang/bench_one_batch.py
CHANGED
@@ -43,6 +43,7 @@ I'm going to the park
 """
 
 import argparse
+import copy
 import dataclasses
 import itertools
 import json
@@ -60,6 +61,7 @@ from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.managers.scheduler import Scheduler
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -84,12 +86,14 @@ class BenchArgs:
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
+    prompt_filename: str = ""
     result_filename: str = "result.jsonl"
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
     log_decode_step: int = 0
     profile: bool = False
+    profile_record_shapes: bool = False
     profile_filename_prefix: str = "profile"
 
     @staticmethod
@@ -104,6 +108,9 @@ class BenchArgs:
         parser.add_argument(
             "--output-len", type=int, nargs="+", default=BenchArgs.output_len
         )
+        parser.add_argument(
+            "--prompt-filename", type=str, default=BenchArgs.prompt_filename
+        )
         parser.add_argument(
             "--result-filename", type=str, default=BenchArgs.result_filename
         )
@@ -118,6 +125,11 @@ class BenchArgs:
         parser.add_argument(
             "--profile", action="store_true", help="Use Torch Profiler."
         )
+        parser.add_argument(
+            "--profile-record-shapes",
+            action="store_true",
+            help="Record tensor shapes in profiling results.",
+        )
         parser.add_argument(
             "--profile-filename-prefix",
             type=str,
@@ -165,12 +177,16 @@ def load_model(server_args, port_args, tp_rank):
     return model_runner, tokenizer
 
 
-def prepare_inputs_for_correctness_test(bench_args, tokenizer):
-    prompts =
-
-
-
-
+def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_prompts):
+    prompts = (
+        custom_prompts
+        if custom_prompts
+        else [
+            "The capital of France is",
+            "The capital of the United Kindom is",
+            "Today is a sunny day and I like",
+        ]
+    )
     input_ids = [tokenizer.encode(p) for p in prompts]
     sampling_params = SamplingParams(
         temperature=0,
@@ -211,8 +227,14 @@ def prepare_extend_inputs_for_correctness_test(
     return reqs
 
 
-def prepare_synthetic_inputs_for_latency_test(
-
+def prepare_synthetic_inputs_for_latency_test(
+    batch_size, input_len, custom_inputs=None
+):
+    input_ids = (
+        custom_inputs
+        if custom_inputs
+        else np.random.randint(0, 10000, (batch_size, input_len), dtype=np.int32)
+    )
     sampling_params = SamplingParams(
         temperature=0,
         max_new_tokens=BenchArgs.output_len,
@@ -279,11 +301,40 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
         disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
         spec_algorithm=SpeculativeAlgorithm.NONE,
         speculative_num_draft_tokens=None,
+        enable_two_batch_overlap=model_runner.server_args.enable_two_batch_overlap,
+        enable_deepep_moe=MoeA2ABackend(
+            model_runner.server_args.moe_a2a_backend
+        ).is_deepep(),
+        deepep_mode=DeepEPMode(model_runner.server_args.deepep_mode),
         require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
         disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
     )
 
 
+def _read_prompts_from_file(prompt_file, rank_print):
+    """Read custom prompts from the file specified by `--prompt-filename`."""
+    if not prompt_file:
+        return []
+    if not os.path.exists(prompt_file):
+        rank_print(
+            f"Custom prompt file {prompt_file} not found. Using default inputs..."
+        )
+        return []
+    with open(prompt_file, "r") as pf:
+        return pf.readlines()
+
+
+def _save_profile_trace_results(profiler, filename):
+    parent_dir = os.path.dirname(os.path.abspath(filename))
+    os.makedirs(parent_dir, exist_ok=True)
+    profiler.export_chrome_trace(filename)
+    print(
+        profiler.key_averages(group_by_input_shape=True).table(
+            sort_by="self_cpu_time_total"
+        )
+    )
+
+
 def correctness_test(
     server_args,
     port_args,
@@ -298,7 +349,10 @@ def correctness_test(
     model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
 
     # Prepare inputs
-
+    custom_prompts = _read_prompts_from_file(bench_args.prompt_filename, rank_print)
+    input_ids, reqs = prepare_inputs_for_correctness_test(
+        bench_args, tokenizer, custom_prompts
+    )
     rank_print(f"\n{input_ids=}\n")
 
     if bench_args.cut_len > 0:
@@ -344,6 +398,7 @@ def latency_test_run_once(
     device,
     log_decode_step,
     profile,
+    profile_record_shapes,
     profile_filename_prefix,
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
@@ -374,6 +429,7 @@
                 torch.profiler.ProfilerActivity.CUDA,
             ],
             with_stack=True,
+            record_shapes=profile_record_shapes,
         )
         profiler.start()
 
@@ -391,10 +447,30 @@
     measurement_results["prefill_latency"] = prefill_latency
     measurement_results["prefill_throughput"] = throughput
 
+    if profile:
+        profiler.stop()
+        profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
+        _save_profile_trace_results(profiler, profile_filename)
+        rank_print(
+            f"torch profiler chrome trace for prefill saved to {profile_filename}"
+        )
+
     # Decode
     decode_latencies = []
     for i in range(output_len - 1):
         synchronize(device)
+        if profile and i == output_len / 2:
+            profiler = None
+            profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ],
+                with_stack=True,
+                record_shapes=profile_record_shapes,
+            )
+            profiler.start()
+
         tic = time.perf_counter()
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         synchronize(device)
@@ -407,13 +483,13 @@
                 f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
 
-
-
-
-
-
-
-
+        if profile and i == output_len / 2:
+            profiler.stop()
+            profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
+            _save_profile_trace_results(profiler, profile_filename)
+            rank_print(
+                f"torch profiler chrome trace for decoding 1 token saved to {profile_filename}"
+            )
 
     # Record decode timing from 2nd output
     if output_len > 1:
@@ -469,17 +545,42 @@ def latency_test(
         server_args.device,
         log_decode_step=0,
         profile=False,
+        profile_record_shapes=False,
         profile_filename_prefix="",  # not used
     )
 
     rank_print("Benchmark ...")
 
+    custom_inputs = _read_prompts_from_file(bench_args.prompt_filename, rank_print)
+    custom_inputs = [tokenizer.encode(p.strip()) for p in custom_inputs]
+    custom_input_len = len(custom_inputs)
+
     # Run the sweep
     result_list = []
    for bs, il, ol in itertools.product(
        bench_args.batch_size, bench_args.input_len, bench_args.output_len
    ):
-
+        bs_aligned_inputs = []
+        if custom_inputs:
+            if custom_input_len == bs:
+                bs_aligned_inputs = custom_inputs
+            elif custom_input_len > bs:
+                rank_print(
+                    f"Custom input size ({custom_input_len}) is larger than batch_size ({bs}). "
+                    f"Using the first {bs} prompts."
+                )
+                bs_aligned_inputs = copy.deepcopy(custom_inputs[:bs])
+            else:
+                rank_print(
+                    f"Custom input size ({custom_input_len}) is smaller than batch_size ({bs}). "
+                    f"Pad to the desired batch_size with the last prompt."
+                )
+                bs_aligned_inputs = copy.deepcopy(custom_inputs)
+                bs_aligned_inputs.extend(
+                    [bs_aligned_inputs[-1]] * (bs - custom_input_len)
+                )
+
+        reqs = prepare_synthetic_inputs_for_latency_test(bs, il, bs_aligned_inputs)
         ret = latency_test_run_once(
             bench_args.run_name,
             model_runner,
@@ -491,6 +592,7 @@
             server_args.device,
             bench_args.log_decode_step,
             bench_args.profile if tp_rank == 0 else None,
+            bench_args.profile_record_shapes if tp_rank == 0 else None,
            bench_args.profile_filename_prefix,
        )
        if ret is not None:
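With these changes, `sglang.bench_one_batch` can read one prompt per line from the file passed via `--prompt-filename`, and it aligns those prompts to each benchmarked batch size by truncating or padding with the last prompt. A standalone sketch of that alignment behavior (the helper below is hypothetical and written only to illustrate the logic added in `latency_test`):

    def align_to_batch(prompts, batch_size):
        # Hypothetical helper mirroring the padding/truncation added above.
        if not prompts:
            return []
        if len(prompts) >= batch_size:
            return prompts[:batch_size]  # keep only the first `batch_size` prompts
        # pad with copies of the last prompt up to the desired batch size
        return prompts + [prompts[-1]] * (batch_size - len(prompts))

    prompts = ["prompt a", "prompt b", "prompt c"]
    print(align_to_batch(prompts, 2))  # ['prompt a', 'prompt b']
    print(align_to_batch(prompts, 5))  # ['prompt a', 'prompt b', 'prompt c', 'prompt c', 'prompt c']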
sglang/lang/chat_template.py
CHANGED
@@ -505,6 +505,22 @@ register_chat_template(
     )
 )
 
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/glm4_v#usage-example
+register_chat_template(
+    ChatTemplate(
+        name="glm-4v",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("<|system|>\n", "\n"),
+            "user": ("<|user|>\n", "\n"),
+            "assistant": ("<|assistant|>\n", "\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=["<|user|>", "<|endoftext|>", "<|observation|>"],
+        image_token="<|image|>",
+    )
+)
+
 
 @register_chat_template_matching_function
 def match_deepseek(model_path: str):
@@ -562,6 +578,8 @@ def match_chat_ml(model_path: str):
         return "chatml"
     if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
         return "qwen2-vl"
+    if re.search(r"glm[-_]?4(\.\d+)?v", model_path, re.IGNORECASE):
+        return "glm-4v"
     if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search(
         r"llava", model_path, re.IGNORECASE
     ):
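The new `match_chat_ml` branch routes GLM-4V-style model paths to the `glm-4v` template. A quick standalone check of the pattern (the model paths below are illustrative only):

    import re

    pattern = r"glm[-_]?4(\.\d+)?v"
    for path in ["THUDM/glm-4v-9b", "zai-org/GLM-4.1V-9B-Thinking", "Qwen/Qwen2-VL-7B-Instruct"]:
        print(path, bool(re.search(pattern, path, re.IGNORECASE)))
    # -> True, True, False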
sglang/srt/bench_utils.py
ADDED
@@ -0,0 +1,137 @@
+import os
+import sys
+from contextlib import nullcontext
+
+import torch
+
+
+# NOTE copied and modified from DeepGEMM
+class suppress_stdout_stderr:
+    def __enter__(self):
+        self.outnull_file = open(os.devnull, "w")
+        self.errnull_file = open(os.devnull, "w")
+
+        self.old_stdout_fileno_undup = sys.stdout.fileno()
+        self.old_stderr_fileno_undup = sys.stderr.fileno()
+
+        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
+        self.old_stderr_fileno = os.dup(sys.stderr.fileno())
+
+        self.old_stdout = sys.stdout
+        self.old_stderr = sys.stderr
+
+        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
+        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+
+        sys.stdout = self.outnull_file
+        sys.stderr = self.errnull_file
+        return self
+
+    def __exit__(self, *_):
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+
+        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+
+        os.close(self.old_stdout_fileno)
+        os.close(self.old_stderr_fileno)
+
+        self.outnull_file.close()
+        self.errnull_file.close()
+
+
+# NOTE copied and modified from DeepGEMM
+def bench_kineto(
+    fn,
+    kernel_names,
+    num_tests: int = 30,
+    suppress_kineto_output: bool = False,
+    trace_path: str = None,
+    flush_l2: bool = True,
+    with_multiple_kernels: bool = False,
+):
+    # Conflict with Nsight Systems
+    using_nsys = int(os.environ.get("SGLANG_NSYS_PROFILING", 0))
+
+    # By default, flush L2 with an excessive 8GB memset to give the GPU some (literal) chill time without full idle
+    flush_l2_size = int(8e9 // 4)
+
+    # For some auto-tuning kernels with prints
+    fn()
+
+    # Profile
+    suppress = (
+        suppress_stdout_stderr
+        if suppress_kineto_output and not using_nsys
+        else nullcontext
+    )
+    with suppress():
+        schedule = (
+            torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
+            if not using_nsys
+            else None
+        )
+        profiler = (
+            torch.profiler.profile(
+                activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
+            )
+            if not using_nsys
+            else nullcontext()
+        )
+        with profiler:
+            for i in range(2):
+                for _ in range(num_tests):
+                    if flush_l2:
+                        torch.empty(
+                            flush_l2_size, dtype=torch.int, device="cuda"
+                        ).zero_()
+                    fn()
+
+                if not using_nsys:
+                    profiler.step()
+
+    # Return 1 if using Nsight Systems
+    if using_nsys:
+        return 1
+
+    # Parse the profiling table
+    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
+    is_tuple = isinstance(kernel_names, tuple)
+    prof_lines = (
+        profiler.key_averages()
+        .table(sort_by="cuda_time_total", max_name_column_width=100)
+        .split("\n")
+    )
+    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
+    assert all([isinstance(name, str) for name in kernel_names])
+    if not with_multiple_kernels:
+        for name in kernel_names:
+            assert (
+                sum([name in line for line in prof_lines]) == 1
+            ), f"Errors of the kernel {name} in the profiling table (table: {prof_lines})"
+
+    # Save chrome traces
+    if trace_path is not None:
+        profiler.export_chrome_trace(trace_path)
+
+    # Return average kernel times
+    units = {"ms": 1e3, "us": 1e6}
+    kernel_times = []
+    for name in kernel_names:
+        total_time = 0
+        total_num = 0
+        for line in prof_lines:
+            if name in line:
+                time_str = line.split()[-2]
+                num_str = line.split()[-1]
+                for unit, scale in units.items():
+                    if unit in time_str:
+                        total_time += (
+                            float(time_str.replace(unit, "")) / scale * int(num_str)
+                        )
+                        total_num += int(num_str)
+                        break
+        kernel_times.append(total_time / total_num)
+
+    return tuple(kernel_times) if is_tuple else kernel_times[0]
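The new `bench_kineto` helper times named CUDA kernels through the Kineto profiler and returns an average per-kernel time in seconds. A minimal usage sketch, assuming a CUDA device is available and that the substring passed as `kernel_names` uniquely matches one row of the profiler table (the matmul workload and the "gemm" substring are illustrative assumptions):

    import torch
    from sglang.srt.bench_utils import bench_kineto

    a = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
    b = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)

    # Average time (seconds) of the kernel whose name contains "gemm".
    avg_s = bench_kineto(lambda: a @ b, kernel_names="gemm", num_tests=10)
    print(f"avg kernel time: {avg_s * 1e6:.1f} us")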
sglang/srt/configs/model_config.py
CHANGED
@@ -27,6 +27,7 @@ from sglang.srt.hf_transformers_utils import (
     get_context_length,
     get_generation_config,
     get_hf_text_config,
+    get_sparse_attention_config,
 )
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
@@ -63,13 +64,12 @@ class ModelConfig:
         hybrid_kvcache_ratio: Optional[float] = None,
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
-
+        # Parse args
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
         self.model_impl = model_impl
 
-        # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
         self.model_override_args = json.loads(model_override_args)
         kwargs = {}
@@ -133,6 +133,12 @@ class ModelConfig:
 
         if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
             self.hf_config.architectures[0] = "MiMoMTP"
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
+
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -270,15 +276,16 @@
         # Verify quantization
         self._verify_quantization()
 
+        # Verify dual-chunk attention config
+        self._verify_dual_chunk_attention_config()
+
         # Cache attributes
         self.hf_eos_token_id = self.get_hf_eos_token_id()
 
-        config = self.hf_config
-
         # multimodal
-        self.image_token_id = getattr(
-
-        )
+        self.image_token_id = getattr(
+            self.hf_config, "image_token_id", None
+        ) or getattr(self.hf_config, "image_token_index", None)
 
     @staticmethod
     def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
@@ -297,6 +304,13 @@ class ModelConfig:
             **kwargs,
         )
 
+    def get_total_num_attention_heads(self) -> int:
+        return self.num_attention_heads
+
+    def get_num_attention_heads(self, tensor_parallel_size) -> int:
+        total_num_attention_heads = self.num_attention_heads
+        return max(1, total_num_attention_heads // tensor_parallel_size)
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
@@ -401,6 +415,8 @@
             "fbgemm_fp8",
             "w8a8_fp8",
             "petit_nvfp4",
+            "quark",
+            "mxfp4",
         ]
         optimized_quantization_methods = [
             "fp8",
@@ -482,6 +498,23 @@
             self.quantization,
         )
 
+    def _verify_dual_chunk_attention_config(self) -> None:
+        if hasattr(self.hf_config, "dual_chunk_attention_config"):
+            # Try loading the sparse attention config
+            sparse_attn_config = get_sparse_attention_config(self.model_path)
+            if not sparse_attn_config:
+                return
+            self.hf_config.dual_chunk_attention_config["sparse_attention_config"] = (
+                sparse_attn_config
+            )
+            if (
+                "sparse_attention_enabled"
+                not in self.hf_config.dual_chunk_attention_config
+            ):
+                self.hf_config.dual_chunk_attention_config[
+                    "sparse_attention_enabled"
+                ] = True
+
     def get_hf_eos_token_id(self) -> Optional[Set[int]]:
         eos_ids = getattr(self.hf_config, "eos_token_id", None)
         if eos_ids is not None:
@@ -626,6 +659,8 @@ multimodal_model_archs = [
     "DeepseekVL2ForCausalLM",
     "Gemma3ForConditionalGeneration",
     "Gemma3nForConditionalGeneration",
+    "Glm4vForConditionalGeneration",
+    "Glm4vMoeForConditionalGeneration",
     "Grok1VForCausalLM",
     "Grok1AForCausalLM",
     "LlavaLlamaForCausalLM",
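The new attention-head helpers mirror the existing KV-head logic: total heads are floor-divided across tensor-parallel ranks and clamped at one. A standalone illustration of that arithmetic (the numbers are arbitrary and the helper name is hypothetical):

    def heads_per_rank(total_num_attention_heads: int, tensor_parallel_size: int) -> int:
        # Same expression as ModelConfig.get_num_attention_heads in the diff above.
        return max(1, total_num_attention_heads // tensor_parallel_size)

    print(heads_per_rank(32, 8))   # 4 heads per rank
    print(heads_per_rank(32, 64))  # clamped to 1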
sglang/srt/conversation.py
CHANGED
@@ -30,8 +30,10 @@ import re
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
+from typing_extensions import Literal
+
 from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
-from sglang.srt.utils import read_system_prompt_from_file
+from sglang.srt.utils import ImageData, read_system_prompt_from_file
 
 
 class SeparatorStyle(IntEnum):
@@ -91,7 +93,7 @@ class Conversation:
     video_token: str = "<video>"
     audio_token: str = "<audio>"
 
-    image_data: Optional[List[
+    image_data: Optional[List[ImageData]] = None
     video_data: Optional[List[str]] = None
     modalities: Optional[List[str]] = None
     stop_token_ids: Optional[int] = None
@@ -381,9 +383,9 @@ class Conversation:
         """Append a new message."""
         self.messages.append([role, message])
 
-    def append_image(self, image: str):
+    def append_image(self, image: str, detail: Literal["auto", "low", "high"]):
         """Append a new image."""
-        self.image_data.append(image)
+        self.image_data.append(ImageData(url=image, detail=detail))
 
     def append_video(self, video: str):
         """Append a new video."""
@@ -627,7 +629,9 @@ def generate_chat_conv(
                         real_content = image_token + real_content
                     else:
                         real_content += image_token
-                    conv.append_image(
+                    conv.append_image(
+                        content.image_url.url, content.image_url.detail
+                    )
                 elif content.type == "video_url":
                     real_content += video_token
                     conv.append_video(content.video_url.url)
sglang/srt/disaggregation/base/conn.py
CHANGED
@@ -25,10 +25,13 @@ class KVArgs:
     gpu_id: int
     # for different tp
     decode_tp_size: int
-    # for pp prefill
-    prefill_pp_size: int
     kv_head_num: int
     page_size: int
+    # for pp prefill
+    prefill_pp_size: int
+    pp_rank: int
+    # for system dp
+    system_dp_rank: int
 
 
 class KVPoll:
|