PyPI - sglang - Versions diffs - 0.5.3.post1__py3-none-any.whl → 0.5.3.post2__py3-none-any.whl - Mend

sglang: 0.5.3.post1 (py3-none-any.whl) → 0.5.3.post2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (216)

sglang/bench_one_batch.py +12 -2
sglang/bench_one_batch_server.py +40 -25
sglang/bench_serving.py +110 -21
sglang/compile_deep_gemm.py +3 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +9 -2
sglang/profiler.py +18 -1
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
sglang/srt/compilation/backend.py +431 -0
sglang/srt/compilation/compilation_config.py +19 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +477 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/load_config.py +3 -2
sglang/srt/configs/mamba_utils.py +1 -1
sglang/srt/configs/model_config.py +17 -3
sglang/srt/configs/qwen3_next.py +0 -3
sglang/srt/constrained/base_grammar_backend.py +5 -1
sglang/srt/constrained/llguidance_backend.py +3 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/xgrammar_backend.py +5 -1
sglang/srt/disaggregation/decode.py +12 -10
sglang/srt/disaggregation/prefill.py +6 -4
sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
sglang/srt/distributed/device_communicators/custom_all_reduce.py +2 -2
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/parallel_state.py +37 -5
sglang/srt/entrypoints/context.py +3 -1
sglang/srt/entrypoints/engine.py +1 -1
sglang/srt/entrypoints/grpc_server.py +172 -28
sglang/srt/entrypoints/harmony_utils.py +2 -1
sglang/srt/entrypoints/http_server.py +2 -1
sglang/srt/entrypoints/openai/protocol.py +3 -0
sglang/srt/entrypoints/openai/serving_base.py +2 -1
sglang/srt/entrypoints/openai/serving_chat.py +3 -2
sglang/srt/entrypoints/openai/serving_completions.py +1 -0
sglang/srt/entrypoints/openai/serving_responses.py +1 -1
sglang/srt/environ.py +10 -0
sglang/srt/eplb/expert_distribution.py +3 -4
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/gpt_oss_detector.py +1 -1
sglang/srt/function_call/utils.py +2 -1
sglang/srt/grpc/compile_proto.py +2 -2
sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +89 -25
sglang/srt/grpc/sglang_scheduler_pb2.py +38 -38
sglang/srt/grpc/sglang_scheduler_pb2.pyi +2 -4
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +1 -1
sglang/srt/layers/attention/aiter_backend.py +3 -3
sglang/srt/layers/attention/attention_registry.py +2 -0
sglang/srt/layers/attention/base_attn_backend.py +19 -0
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/flashattention_backend.py +12 -7
sglang/srt/layers/attention/flashinfer_backend.py +18 -15
sglang/srt/layers/attention/flashinfer_mla_backend.py +9 -9
sglang/srt/layers/attention/flashmla_backend.py +2 -2
sglang/srt/layers/attention/nsa/nsa_indexer.py +10 -4
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/triton_backend.py +71 -32
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
sglang/srt/layers/attention/trtllm_mla_backend.py +5 -5
sglang/srt/layers/attention/vision.py +3 -3
sglang/srt/layers/communicator.py +8 -5
sglang/srt/layers/layernorm.py +10 -5
sglang/srt/layers/logits_processor.py +13 -13
sglang/srt/layers/moe/cutlass_w4a8_moe.py +196 -0
sglang/srt/layers/moe/ep_moe/layer.py +24 -4
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -2
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +8 -0
sglang/srt/layers/moe/token_dispatcher/deepep.py +13 -4
sglang/srt/layers/moe/token_dispatcher/mooncake.py +394 -0
sglang/srt/layers/moe/utils.py +8 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -2
sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +3 -5
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +3 -2
sglang/srt/layers/quantization/fp8_kernel.py +35 -8
sglang/srt/layers/quantization/fp8_utils.py +1 -3
sglang/srt/layers/quantization/int8_kernel.py +19 -3
sglang/srt/layers/quantization/modelopt_quant.py +9 -19
sglang/srt/layers/quantization/mxfp4.py +4 -4
sglang/srt/layers/quantization/w4afp8.py +47 -1
sglang/srt/layers/radix_attention.py +59 -9
sglang/srt/layers/rotary_embedding.py +33 -9
sglang/srt/layers/sampler.py +33 -13
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora_manager.py +23 -0
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +40 -16
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
sglang/srt/managers/cache_controller.py +45 -13
sglang/srt/managers/data_parallel_controller.py +123 -27
sglang/srt/managers/detokenizer_manager.py +3 -0
sglang/srt/managers/io_struct.py +43 -3
sglang/srt/managers/mm_utils.py +2 -2
sglang/srt/managers/multi_tokenizer_mixin.py +17 -0
sglang/srt/managers/overlap_utils.py +82 -7
sglang/srt/managers/schedule_batch.py +180 -469
sglang/srt/managers/schedule_policy.py +12 -0
sglang/srt/managers/scheduler.py +248 -142
sglang/srt/managers/scheduler_metrics_mixin.py +51 -2
sglang/srt/managers/scheduler_output_processor_mixin.py +97 -10
sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
sglang/srt/managers/scheduler_update_weights_mixin.py +19 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +2 -0
sglang/srt/managers/tokenizer_manager.py +45 -7
sglang/srt/managers/tp_worker.py +30 -12
sglang/srt/mem_cache/base_prefix_cache.py +1 -1
sglang/srt/mem_cache/chunk_cache.py +5 -1
sglang/srt/mem_cache/common.py +475 -0
sglang/srt/mem_cache/hicache_storage.py +4 -1
sglang/srt/mem_cache/hiradix_cache.py +16 -3
sglang/srt/mem_cache/mamba_radix_cache.py +995 -0
sglang/srt/mem_cache/memory_pool.py +38 -29
sglang/srt/mem_cache/radix_cache.py +91 -17
sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
sglang/srt/mem_cache/storage/backend_factory.py +2 -2
sglang/srt/mem_cache/storage/eic/eic_storage.py +3 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +4 -2
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +44 -17
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +37 -7
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +16 -1
sglang/srt/mem_cache/swa_radix_cache.py +25 -15
sglang/srt/metrics/collector.py +18 -0
sglang/srt/model_executor/cuda_graph_runner.py +3 -2
sglang/srt/model_executor/forward_batch_info.py +10 -4
sglang/srt/model_executor/model_runner.py +210 -66
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +527 -0
sglang/srt/model_loader/loader.py +114 -39
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +8 -8
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/deepseek_nextn.py +2 -2
sglang/srt/models/deepseek_v2.py +49 -32
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +2 -9
sglang/srt/models/glm4_moe.py +8 -12
sglang/srt/models/glm4_moe_nextn.py +2 -2
sglang/srt/models/glm4v.py +1 -1
sglang/srt/models/glm4v_moe.py +5 -5
sglang/srt/models/gpt_oss.py +4 -4
sglang/srt/models/grok.py +5 -10
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +3 -1
sglang/srt/models/llama.py +2 -2
sglang/srt/models/longcat_flash.py +3 -7
sglang/srt/models/minicpmo.py +7 -2
sglang/srt/models/mllama4.py +2 -2
sglang/srt/models/qwen2_5_vl.py +1 -1
sglang/srt/models/qwen2_moe.py +4 -4
sglang/srt/models/qwen2_vl.py +1 -1
sglang/srt/models/qwen3_moe.py +4 -4
sglang/srt/models/qwen3_next.py +2 -2
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_vl.py +9 -10
sglang/srt/models/qwen3_vl_moe.py +6 -15
sglang/srt/models/step3_vl.py +2 -3
sglang/srt/sampling/custom_logit_processor.py +2 -1
sglang/srt/sampling/sampling_batch_info.py +6 -13
sglang/srt/sampling/sampling_params.py +70 -2
sglang/srt/server_args.py +245 -31
sglang/srt/single_batch_overlap.py +0 -1
sglang/srt/speculative/draft_utils.py +210 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -6
sglang/srt/speculative/eagle_info.py +53 -17
sglang/srt/speculative/eagle_info_v2.py +404 -0
sglang/srt/speculative/eagle_utils.py +138 -0
sglang/srt/speculative/eagle_worker.py +55 -223
sglang/srt/speculative/eagle_worker_v2.py +484 -0
sglang/srt/speculative/ngram_info.py +14 -9
sglang/srt/speculative/spec_utils.py +1 -1
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/two_batch_overlap.py +22 -13
sglang/srt/utils/__init__.py +1 -1
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/utils/common.py +77 -24
sglang/srt/utils/hf_transformers_utils.py +2 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/utils/profile_merger.py +199 -0
sglang/test/run_eval.py +1 -0
sglang/test/runners.py +2 -0
sglang/test/simple_eval_common.py +3 -0
sglang/test/simple_eval_longbench_v2.py +33 -21
sglang/test/test_cutlass_moe.py +1 -1
sglang/test/test_deterministic.py +2 -5
sglang/test/test_deterministic_utils.py +3 -3
sglang/test/test_utils.py +5 -6
sglang/version.py +1 -1
{sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/METADATA +23 -11
{sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/RECORD +215 -192
sglang/srt/speculative/build_eagle_tree.py +0 -427
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -0
/sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
{sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/WHEEL +0 -0
{sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3.post1.dist-info → sglang-0.5.3.post2.dist-info}/top_level.txt +0 -0

sglang/bench_one_batch.py CHANGED Viewed

@@ -51,6 +51,7 @@ import logging
 import multiprocessing
 import os
 import time
+from types import SimpleNamespace
 from typing import Tuple
 import numpy as np
@@ -257,11 +258,18 @@ def prepare_synthetic_inputs_for_latency_test(
 @torch.no_grad
 def extend(reqs, model_runner):
+    # Create dummy tree_cache for benchmarks (no prefix caching, just allocation)
+    dummy_tree_cache = SimpleNamespace(
+        page_size=1,
+        device=model_runner.device,
+        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
+    )
     batch = ScheduleBatch.init_new(
         reqs=reqs,
         req_to_token_pool=model_runner.req_to_token_pool,
         token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
-        tree_cache=None,
+        tree_cache=dummy_tree_cache,
         model_config=model_runner.model_config,
         enable_overlap=False,
         spec_algorithm=SpeculativeAlgorithm.NONE,
@@ -510,7 +518,9 @@ def latency_test(
     # Set CPU affinity
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
-        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
+        set_gpu_proc_affinity(
+            server_args.pp_size, server_args.tp_size, server_args.nnodes, tp_rank
+        )
     # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")

sglang/bench_one_batch_server.py CHANGED Viewed

@@ -25,8 +25,10 @@ from typing import List, Optional, Tuple
 import numpy as np
 import requests
 from pydantic import BaseModel
+from transformers import AutoProcessor, PreTrainedTokenizer
 from sglang.bench_serving import (
+    get_processor,
     get_tokenizer,
     sample_mmmu_requests,
     sample_random_requests,
@@ -104,8 +106,14 @@ Note: To view the traces through perfetto-ui, please:
             if self.profile_links.extend or self.profile_links.decode:
                 # Create a combined link or use the first available one
                 trace_files = [self.profile_links.extend, self.profile_links.decode]
+                if any(trace_file is None for trace_file in trace_files):
+                    logger.error("Some trace files are None", f"{trace_files=}")
                 trace_files_relay_links = [
-                    f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                    (
+                        f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                        if trace_file
+                        else "N/A"
+                    )
                     for trace_file in trace_files
                 ]
@@ -114,30 +122,31 @@ Note: To view the traces through perfetto-ui, please:
         # Build the row
         return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
-    @classmethod
-    def generate_markdown_report(
-        cls, trace_dir, results: List["BenchmarkResult"]
-    ) -> str:
-        """Generate a markdown report from a list of BenchmarkResult object from a single run."""
-        import os
-        summary = f"### {results[0].model_path}\n"
+def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
+    """Generate a markdown report from a list of BenchmarkResult object from a single run."""
+    import os
+    summary = f"### {results[0].model_path}\n"
-        # summary += (
-        #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
-        # )
-        summary += "| batch size | input len | latency (s) | input throughput (tok/s)  | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
-        summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
+    # summary += (
+    #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
+    # )
+    summary += "| batch size | input len | latency (s) | input throughput (tok/s)  | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
+    summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
-        # all results should share the same isl & osl
-        for result in results:
-            base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
-            relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
-            relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
-            # base_url = "https://github.com/sgl-project/ci-data/traces"
-            summary += result.to_markdown_row(trace_dir, base_url, relay_base)
+    # all results should share the same isl & osl
+    for result in results:
+        base_url = os.getenv(
+            "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
+        ).rstrip("/")
+        relay_base = os.getenv(
+            "PERFETTO_RELAY_URL",
+            "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
+        ).rstrip("/")
+        summary += result.to_markdown_row(trace_dir, base_url, relay_base)
-        return summary
+    return summary
 @dataclasses.dataclass
@@ -288,7 +297,7 @@ def run_one_case(
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
-    tokenizer,
+    tokenizer: PreTrainedTokenizer | AutoProcessor,
     dataset_name="",
     profile: bool = False,
     profile_steps: int = 3,
@@ -302,9 +311,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         input_requests = sample_mmmu_requests(
             num_requests=batch_size,
-            tokenizer=tokenizer,
+            processor=tokenizer,
             fixed_output_len=output_len,
-            apply_chat_template=True,
             random_sample=False,
         )
     elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         # vlm
         input_ids = []
+        # for vlms, tokenizer is an instance of AutoProcessor
+        tokenizer = tokenizer.tokenizer
         for input_req in input_requests:
             input_ids += [tokenizer.encode(input_req.prompt)]
         payload["image_data"] = [req.image_data for req in input_requests]
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         tokenizer_path = server_info["tokenizer_path"]
     elif "prefill" in server_info:
         tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
-    tokenizer = get_tokenizer(tokenizer_path)
+    if bench_args.dataset_name == "mmmu":
+        # mmmu implies this is a MLLM
+        tokenizer = get_processor(tokenizer_path)
+    else:
+        tokenizer = get_tokenizer(tokenizer_path)
     # warmup
     if not bench_args.skip_warmup:

sglang/bench_serving.py CHANGED Viewed

@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 import argparse
 import asyncio
-import base64
 import io
 import json
 import os
@@ -623,6 +622,48 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
     return output
+def _build_profile_urls(
+    profile_prefill_url: Optional[List[str]],
+    profile_decode_url: Optional[List[str]],
+) -> List[Tuple[str, str]]:
+    """Build profile URLs list from prefill/decode URL arguments.
+    Returns:
+        List of (worker_type, url) tuples. e.g., [("Prefill-0", "http://..."), ("Decode-0", "http://...")]
+    """
+    profile_urls = []
+    if profile_prefill_url:
+        for idx, url in enumerate(profile_prefill_url):
+            profile_urls.append((f"Prefill-{idx}", url))
+    if profile_decode_url:
+        for idx, url in enumerate(profile_decode_url):
+            profile_urls.append((f"Decode-{idx}", url))
+    return profile_urls
+async def _call_profile_pd(profile_urls: List[Tuple[str, str]], mode: str) -> None:
+    """Call profile endpoint (start/stop) on PD separated workers.
+    Args:
+        profile_urls: List of (worker_type, url) tuples
+        mode: "start" or "stop"
+    """
+    endpoint = "/start_profile" if mode == "start" else "/stop_profile"
+    action = "Starting" if mode == "start" else "Stopping"
+    action_past = "started" if mode == "start" else "stopped"
+    print(f"{action} profiler...")
+    for worker_type, url in profile_urls:
+        profile_output = await async_request_profile(api_url=url + endpoint)
+        if profile_output.success:
+            print(f"Profiler {action_past} for {worker_type} worker at {url}")
+        else:
+            print(
+                f"Failed to {mode} profiler for {worker_type} worker at {url}: {profile_output.error}"
+            )
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
         import huggingface_hub.constants
@@ -671,7 +712,7 @@ def get_processor(
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_processor
+        from sglang.srt.utils.hf_transformers_utils import get_processor
         return get_processor(pretrained_model_name_or_path)
@@ -935,7 +976,7 @@ async def get_mooncake_request_over_time(
         for i in range(num_rounds):
             # Add user query for the current round
             chat_history.append(
-                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+                {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
             )
             # Form the full prompt from history
@@ -964,7 +1005,7 @@ async def get_mooncake_request_over_time(
 def sample_mmmu_requests(
     num_requests: int,
-    processor: AutoProcessor,
+    processor: AutoProcessor | AutoTokenizer,
     fixed_output_len: Optional[int] = None,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
@@ -973,9 +1014,7 @@ def sample_mmmu_requests(
     Args:
         num_requests: Number of requests to sample.
-        tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
-        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.
     Returns:
@@ -1282,11 +1321,11 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     )
-def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
+def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
     try:
         content_items = [
-            {"type": "image_url", "image_url": {"url": img_url}}
-            for img_url in images_base64
+            {"type": "image", "image": {"url": image_base64}}
+            for image_base64 in images_base64
         ]
         content_items.append({"type": "text", "text": text_prompt})
         prompt_str = processor.apply_chat_template(
@@ -1294,7 +1333,9 @@ def create_mm_data_row(text_prompt, images, images_base64, output_len, processor
             add_generation_prompt=True,
             tokenize=False,
         )
-    except Exception:
+    except Exception as e:
+        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+        print(f"Error applying chat template: {e}, fallback to <image> tag")
         # Some tokenizers do not support list content; fall back to a placeholder in the text
         prompt_str = f"<image>{text_prompt}"
@@ -1425,7 +1466,7 @@ def sample_image_requests(
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
     print(
-        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
     )
     return dataset
@@ -1676,6 +1717,8 @@ async def benchmark(
     use_trace_timestamps: bool = False,
     mooncake_slowdown_factor=1.0,
     mooncake_num_rounds=1,
+    profile_prefill_url: Optional[List[str]] = None,
+    profile_decode_url: Optional[List[str]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1765,14 +1808,28 @@ async def benchmark(
     time.sleep(1.0)
+    # Build profile URLs for PD separated mode (do this once at the beginning)
+    pd_profile_urls = []
+    if profile and pd_separated:
+        pd_profile_urls = _build_profile_urls(profile_prefill_url, profile_decode_url)
+        if not pd_profile_urls:
+            print(
+                "Warning: PD separated mode requires --profile-prefill-url or --profile-decode-url"
+            )
+            print("Skipping profiler start. Please specify worker URLs for profiling.")
     # Start profiler
     if profile:
-        print("Starting profiler...")
-        profile_output = await async_request_profile(
-            api_url=base_url + "/start_profile"
-        )
-        if profile_output.success:
-            print("Profiler started")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "start")
+        else:
+            print("Starting profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/start_profile"
+            )
+            if profile_output.success:
+                print("Profiler started")
     # Run all requests
     benchmark_start_time = time.perf_counter()
@@ -1821,10 +1878,16 @@ async def benchmark(
     # Stop profiler
     if profile:
-        print("Stopping profiler...")
-        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
-        if profile_output.success:
-            print("Profiler stopped")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "stop")
+        else:
+            print("Stopping profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/stop_profile"
+            )
+            if profile_output.success:
+                print("Profiler stopped")
     if pbar is not None:
         pbar.close()
@@ -2205,6 +2268,8 @@ def run_benchmark(args_: argparse.Namespace):
             use_trace_timestamps=args.use_trace_timestamps,
             mooncake_slowdown_factor=args.mooncake_slowdown_factor,
             mooncake_num_rounds=args.mooncake_num_rounds,
+            profile_prefill_url=getattr(args, "profile_prefill_url", None),
+            profile_decode_url=getattr(args, "profile_decode_url", None),
         )
     )
@@ -2430,6 +2495,30 @@ if __name__ == "__main__":
         action="store_true",
         help="Benchmark PD disaggregation server",
     )
+    # Create a mutually exclusive group for profiling URLs
+    # In PD separated mode, prefill and decode workers must be profiled separately
+    profile_url_group = parser.add_mutually_exclusive_group()
+    profile_url_group.add_argument(
+        "--profile-prefill-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the prefill worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-prefill-url http://localhost:30000 http://localhost:30001. "
+        "NOTE: Cannot be used together with --profile-decode-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
+    profile_url_group.add_argument(
+        "--profile-decode-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the decode worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-decode-url http://localhost:30010 http://localhost:30011. "
+        "NOTE: Cannot be used together with --profile-prefill-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
     parser.add_argument(
         "--flush-cache",
         action="store_true",

sglang/compile_deep_gemm.py CHANGED Viewed

@@ -19,6 +19,7 @@ import requests
 from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
 from sglang.srt.entrypoints.http_server import launch_server
+from sglang.srt.environ import envs
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.server_args import ServerArgs
@@ -28,9 +29,9 @@ from sglang.srt.warmup import warmup
 multiprocessing.set_start_method("spawn", force=True)
 # Reduce warning
-os.environ["SGL_IN_DEEPGEMM_PRECOMPILE_STAGE"] = "1"
+envs.SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE.set(True)
 # Force enable deep gemm
-os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "1"
+envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(True)
 # Force enable mha chunked kv for DeepSeek V3 to avoid missing kv_b_proj DeepGEMM case
 os.environ["SGL_CHUNKED_PREFIX_CACHE_THRESHOLD"] = "0"

sglang/global_config.py CHANGED Viewed

@@ -1,14 +1,11 @@
 """Global configurations"""
-import os
+# FIXME: deprecate this file and move all usage to sglang.srt.environ or sglang.__init__.py
 class GlobalConfig:
     """
     Store some global constants.
-    See also python/sglang/srt/managers/schedule_batch.py::global_server_args_dict, which stores
-    many global runtime arguments as well.
     """
     def __init__(self):
@@ -20,27 +17,6 @@ class GlobalConfig:
         # Default backend of the language
         self.default_backend = None
-        # Runtime constants: New generation token ratio estimation
-        self.default_init_new_token_ratio = float(
-            os.environ.get("SGLANG_INIT_NEW_TOKEN_RATIO", 0.7)
-        )
-        self.default_min_new_token_ratio_factor = float(
-            os.environ.get("SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR", 0.14)
-        )
-        self.default_new_token_ratio_decay_steps = float(
-            os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
-        )
-        self.torch_empty_cache_interval = float(
-            os.environ.get(
-                "SGLANG_EMPTY_CACHE_INTERVAL", -1
-            )  # in seconds. Set if you observe high memory accumulation over a long serving period.
-        )
-        # Runtime constants: others
-        self.retract_decode_steps = 20
-        self.flashinfer_workspace_size = int(
-            os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024)
-        )
         # Output tokenization configs
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True

sglang/lang/api.py CHANGED Viewed

@@ -79,6 +79,7 @@ def gen(
     n: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
+    stop_regex: Optional[Union[str, List[str]]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
@@ -120,6 +121,7 @@ def gen(
         n,
         stop,
         stop_token_ids,
+        stop_regex,
         temperature,
         top_p,
         top_k,
@@ -143,6 +145,7 @@ def gen_int(
     n: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
+    stop_regex: Optional[Union[str, List[str]]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
@@ -162,6 +165,7 @@ def gen_int(
         n,
         stop,
         stop_token_ids,
+        stop_regex,
         temperature,
         top_p,
         top_k,
@@ -184,6 +188,7 @@ def gen_string(
     n: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
+    stop_regex: Optional[Union[str, List[str]]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
@@ -203,6 +208,7 @@ def gen_string(
         n,
         stop,
         stop_token_ids,
+        stop_regex,
         temperature,
         top_p,
         top_k,

sglang/lang/interpreter.py CHANGED Viewed

@@ -792,6 +792,7 @@ class StreamExecutor:
             "n",
             "stop",
             "stop_token_ids",
+            "stop_regex",
             "temperature",
             "top_p",
             "top_k",

sglang/lang/ir.py CHANGED Viewed

@@ -21,6 +21,7 @@ class SglSamplingParams:
     n: int = 1
     stop: Union[str, List[str]] = ()
     stop_token_ids: Optional[List[int]] = ()
+    stop_regex: Optional[Union[str, List[str]]] = ()
     temperature: float = 1.0
     top_p: float = 1.0
     top_k: int = -1  # -1 means disable
@@ -45,6 +46,7 @@ class SglSamplingParams:
             self.n,
             self.stop,
             self.stop_token_ids,
+            self.stop_regex,
             self.temperature,
             self.top_p,
             self.top_k,
@@ -123,6 +125,7 @@ class SglSamplingParams:
             "n": self.n,
             "stop": self.stop,
             "stop_token_ids": self.stop_token_ids,
+            "stop_regex": self.stop_regex,
             "temperature": self.temperature,
             "top_p": self.top_p,
             "top_k": self.top_k,
@@ -161,6 +164,7 @@ class SglFunction:
         n: int = 1,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
+        stop_regex: Optional[Union[str, List[str]]] = None,
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
@@ -184,12 +188,15 @@ class SglFunction:
             stop = []
         if stop_token_ids is None:
             stop_token_ids = []
+        if stop_regex is None:
+            stop_regex = []
         default_sampling_para = SglSamplingParams(
             max_new_tokens=max_new_tokens,
             n=n,
             stop=stop,
             stop_token_ids=stop_token_ids,
+            stop_regex=stop_regex,
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
@@ -221,6 +228,7 @@ class SglFunction:
         n: int = 1,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
+        stop_regex: Optional[Union[str, List[str]]] = None,
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
@@ -243,6 +251,8 @@ class SglFunction:
             stop = []
         if stop_token_ids is None:
             stop_token_ids = []
+        if stop_regex is None:
+            stop_regex = []
         assert isinstance(batch_kwargs, (list, tuple))
         if len(batch_kwargs) == 0:
@@ -267,6 +277,7 @@ class SglFunction:
             n=n,
             stop=stop,
             stop_token_ids=stop_token_ids,
+            stop_regex=stop_regex,
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
@@ -451,6 +462,7 @@ class SglGen(SglExpr):
         n: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
+        stop_regex: Optional[Union[str, List[str]]] = None,
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
         top_k: Optional[int] = None,
@@ -474,6 +486,7 @@ class SglGen(SglExpr):
             min_new_tokens=min_new_tokens,
             n=n,
             stop=stop,
+            stop_regex=stop_regex,
             stop_token_ids=stop_token_ids,
             temperature=temperature,
             top_p=top_p,

sglang/launch_server.py CHANGED Viewed

@@ -1,9 +1,9 @@
 """Launch the inference server."""
+import asyncio
 import os
 import sys
-from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import prepare_server_args
 from sglang.srt.utils import kill_process_tree
@@ -11,6 +11,13 @@ if __name__ == "__main__":
     server_args = prepare_server_args(sys.argv[1:])
     try:
-        launch_server(server_args)
+        if server_args.grpc_mode:
+            from sglang.srt.entrypoints.grpc_server import serve_grpc
+            asyncio.run(serve_grpc(server_args))
+        else:
+            from sglang.srt.entrypoints.http_server import launch_server
+            launch_server(server_args)
     finally:
         kill_process_tree(os.getpid(), include_parent=False)

sglang/profiler.py CHANGED Viewed

@@ -25,6 +25,7 @@ def _run_profile(
     output_dir: Optional[str] = None,
     profile_name: Optional[str] = None,
     profile_by_stage: bool = False,
+    merge_profiles: bool = False,
 ) -> str:
     if output_dir is None:
         output_dir = PROFILER_DIR
@@ -60,6 +61,7 @@ def _run_profile(
         "num_steps": str(num_steps),
         "activities": activities,
         "profile_by_stage": profile_by_stage,
+        "merge_profiles": merge_profiles,
     }
     response = requests.post(url=url + "/start_profile", json=json_data)
@@ -76,10 +78,17 @@ def run_profile(
     output_dir: Optional[str] = None,
     profile_name: Optional[str] = None,
     profile_by_stage: bool = False,
+    merge_profiles: bool = False,
 ):
     # step based profile will self terminate on num_steps constraints
     link = _run_profile(
-        url, num_steps, activities, output_dir, profile_name, profile_by_stage
+        url,
+        num_steps,
+        activities,
+        output_dir,
+        profile_name,
+        profile_by_stage,
+        merge_profiles,
     )
     return link
@@ -145,6 +154,13 @@ if __name__ == "__main__":
         default=False,
         help="Whether to use rpd profiler (https://github.com/ROCm/rocmProfileData)",
     )
+    parser.add_argument(
+        "--merge-profiles",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to merge profiles from all ranks into a single trace file",
+    )
     args = parser.parse_args()
     activities = []
@@ -163,4 +179,5 @@ if __name__ == "__main__":
         args.output_dir,
         args.profile_name,
         args.profile_by_stage,
+        args.merge_profiles,
     )

sglang 0.5.3.post1__py3-none-any.whl → 0.5.3.post2__py3-none-any.whl

sglang 0.5.3.post1__py3-none-any.whl → 0.5.3.post2__py3-none-any.whl