sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- sglang/bench_offline_throughput.py +10 -8
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +157 -21
- sglang/bench_serving.py +137 -59
- sglang/compile_deep_gemm.py +5 -5
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +40 -28
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +49 -44
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +129 -135
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +238 -122
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +10 -19
- sglang/srt/disaggregation/prefill.py +132 -47
- sglang/srt/disaggregation/utils.py +123 -6
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +44 -9
- sglang/srt/entrypoints/http_server.py +23 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +64 -18
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +6 -4
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +61 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
- sglang/srt/layers/moe/ep_moe/layer.py +105 -51
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +67 -10
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +8 -3
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +77 -74
- sglang/srt/layers/quantization/fp8.py +92 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +20 -7
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +2 -4
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +19 -4
- sglang/srt/managers/mm_utils.py +294 -140
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +122 -42
- sglang/srt/managers/schedule_policy.py +1 -5
- sglang/srt/managers/scheduler.py +205 -138
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +232 -58
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +76 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +314 -39
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +29 -19
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +5 -1
- sglang/srt/model_executor/model_runner.py +163 -68
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +308 -351
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +15 -8
- sglang/srt/models/llava.py +258 -7
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +58 -20
- sglang/srt/openai_api/protocol.py +6 -8
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +162 -22
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +138 -7
- sglang/srt/speculative/eagle_worker.py +69 -21
- sglang/srt/utils.py +74 -17
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +55 -14
- sglang/utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
@@ -15,7 +15,119 @@
 
 import time
 from dataclasses import dataclass
-from
+from enum import Enum
+from typing import Dict, List, Optional, Union
+
+from sglang.srt.utils import get_bool_env_var
+
+SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
+
+
+@dataclass
+class TimeStats:
+    """
+    Store the timestamps for each stage of a request.
+
+    Unified: wait_queue -> forward -> completion
+    Prefill: bootstrap_queue -> wait_queue -> forward -> transfer_queue -> completion
+    Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
+    """
+
+    lb_entry_time: float = 0.0
+    wait_queue_entry_time: float = 0.0
+    forward_entry_time: float = 0.0
+    completion_time: float = 0.0
+    prefill_bootstrap_queue_entry_time: float = 0.0
+    prefill_transfer_queue_entry_time: float = 0.0
+    decode_prealloc_queue_entry_time: float = 0.0
+    decode_transfer_queue_entry_time: float = 0.0
+
+    class RequestType(Enum):
+        UNIFIED = "unified"
+        PREFILL = "prefill"
+        DECODE = "decode"
+        INVALID = "invalid"
+
+    def __str__(self) -> str:
+        # if unified
+        _type = self.get_type()
+
+        if _type == self.RequestType.UNIFIED:
+            queue_duration = self.forward_entry_time - self.wait_queue_entry_time
+            forward_duration = self.completion_time - self.forward_entry_time
+
+            if SGLANG_TEST_REQUEST_TIME_STATS:
+                assert (
+                    queue_duration >= 0 and forward_duration >= 0
+                ), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+
+            return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
+        elif _type == self.RequestType.PREFILL:
+            bootstrap_duration = (
+                self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
+            )
+
+            queue_duration = self.forward_entry_time - self.wait_queue_entry_time
+
+            forward_duration = self.completion_time - self.forward_entry_time
+
+            if SGLANG_TEST_REQUEST_TIME_STATS:
+                assert (
+                    bootstrap_duration >= 0
+                    and queue_duration >= 0
+                    and forward_duration >= 0
+                ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+            return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time}"
+        # if decode
+        elif _type == self.RequestType.DECODE:
+            prealloc_duration = (
+                self.decode_transfer_queue_entry_time
+                - self.decode_prealloc_queue_entry_time
+            )
+
+            transfer_duration = (
+                self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
+            )
+            queue_duration = self.forward_entry_time - self.wait_queue_entry_time
+            forward_duration = self.completion_time - self.forward_entry_time
+
+            if SGLANG_TEST_REQUEST_TIME_STATS:
+                assert (
+                    prealloc_duration >= 0
+                    and transfer_duration >= 0
+                    and queue_duration >= 0
+                    and forward_duration >= 0
+                ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+
+            return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time}"
+        else:
+            return "Invalid Time Stats"
+
+    def format_duration(self, duration: float) -> str:
+        return f"{duration * 1e3:.2f}ms"
+
+    def get_type(self) -> RequestType:
+        """Determine the type of request based on timestamp values."""
+        if (
+            self.prefill_bootstrap_queue_entry_time == 0.0
+            and self.prefill_transfer_queue_entry_time == 0.0
+            and self.decode_prealloc_queue_entry_time == 0.0
+            and self.decode_transfer_queue_entry_time == 0.0
+        ):
+            return self.RequestType.UNIFIED
+        elif (
+            self.prefill_bootstrap_queue_entry_time > 0.0
+            and self.prefill_transfer_queue_entry_time > 0.0
+        ):
+            return self.RequestType.PREFILL
+        elif (
+            self.decode_prealloc_queue_entry_time > 0.0
+            and self.decode_transfer_queue_entry_time > 0.0
+            and self.wait_queue_entry_time > 0.0
+        ):
+            return self.RequestType.DECODE
+        else:
+            return self.RequestType.INVALID
 
 
 @dataclass
@@ -26,18 +138,23 @@ class SchedulerStats:
     gen_throughput: float = 0.0
     num_queue_reqs: int = 0
     cache_hit_rate: float = 0.0
+    num_grammar_queue_reqs: int = 0
     spec_accept_length: float = 0.0
     avg_request_queue_latency: float = 0.0
+    num_prefill_prealloc_queue_reqs: int = 0
+    num_prefill_infight_queue_reqs: int = 0
+    num_decode_prealloc_queue_reqs: int = 0
+    num_decode_transfer_queue_reqs: int = 0
 
 
 class SchedulerMetricsCollector:
 
     def __init__(self, labels: Dict[str, str]) -> None:
         # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
-        from prometheus_client import
+        from prometheus_client import Counter, Gauge
 
         self.labels = labels
-        self.last_log_time = time.
+        self.last_log_time = time.perf_counter()
 
         self.num_running_reqs = Gauge(
             name="sglang:num_running_reqs",
@@ -74,6 +191,13 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
 
+        self.num_grammar_queue_reqs = Gauge(
+            name="sglang:num_grammar_queue_reqs",
+            documentation="The number of requests in the grammar waiting queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
         self.cache_hit_rate = Gauge(
             name="sglang:cache_hit_rate",
             documentation="The prefix cache hit rate.",
@@ -95,28 +219,98 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
 
+        # Disaggregation queue metrics
+        self.num_prefill_prealloc_queue_reqs = Gauge(
+            name="sglang:num_prefill_prealloc_queue_reqs",
+            documentation="The number of requests in the prefill prealloc queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.num_prefill_infight_queue_reqs = Gauge(
+            name="sglang:num_prefill_infight_queue_reqs",
+            documentation="The number of requests in the prefill infight queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.num_decode_prealloc_queue_reqs = Gauge(
+            name="sglang:num_decode_prealloc_queue_reqs",
+            documentation="The number of requests in the decode prealloc queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.num_decode_transfer_queue_reqs = Gauge(
+            name="sglang:num_decode_transfer_queue_reqs",
+            documentation="The number of requests in the decode transfer queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        self.num_bootstrap_failed_reqs = Counter(
+            name="sglang:num_bootstrap_failed_reqs",
+            documentation="The number of bootstrap failed requests.",
+            labelnames=labels.keys(),
+        )
+
+        self.num_transfer_failed_reqs = Counter(
+            name="sglang:num_transfer_failed_reqs",
+            documentation="The number of transfer failed requests.",
+            labelnames=labels.keys(),
+        )
+
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
 
+    def increment_bootstrap_failed_reqs(self) -> None:
+        self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
+
+    def increment_transfer_failed_reqs(self) -> None:
+        self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
+
     def log_stats(self, stats: SchedulerStats) -> None:
         self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
         self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
         self._log_gauge(self.token_usage, stats.token_usage)
         self._log_gauge(self.gen_throughput, stats.gen_throughput)
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
+        self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
         self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
-
-
+
+        # Disaggregation metrics
+        self._log_gauge(
+            self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
+        )
+        self._log_gauge(
+            self.num_prefill_infight_queue_reqs, stats.num_prefill_infight_queue_reqs
+        )
+        self._log_gauge(
+            self.num_decode_prealloc_queue_reqs, stats.num_decode_prealloc_queue_reqs
+        )
+        self._log_gauge(
+            self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
+        )
+
+        self.last_log_time = time.perf_counter()
 
 
 class TokenizerMetricsCollector:
-    def __init__(
+    def __init__(
+        self,
+        labels: Dict[str, str],
+        bucket_time_to_first_token: Optional[List[float]] = None,
+        bucket_inter_token_latency: Optional[List[float]] = None,
+        bucket_e2e_request_latency: Optional[List[float]] = None,
+        collect_tokens_histogram: bool = False,
+    ) -> None:
         # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
         from prometheus_client import Counter, Histogram
 
         self.labels = labels
+        self.collect_tokens_histogram = collect_tokens_histogram
 
         self.prompt_tokens_total = Counter(
             name="sglang:prompt_tokens_total",
@@ -130,6 +324,66 @@ class TokenizerMetricsCollector:
             labelnames=labels.keys(),
         )
 
+        if collect_tokens_histogram:
+            bucket_prompt_tokens = [
+                100,
+                300,
+                500,
+                700,
+                1000,
+                1500,
+                2000,
+                3000,
+                4000,
+                5000,
+                6000,
+                7000,
+                8000,
+                9000,
+                10000,
+                12000,
+                15000,
+                20000,
+                22000,
+                25000,
+                30000,
+                35000,
+                40000,
+            ]
+            self.prompt_tokens_histogram = Histogram(
+                name="sglang:prompt_tokens_histogram",
+                documentation="Histogram of prompt token length.",
+                labelnames=labels.keys(),
+                buckets=bucket_prompt_tokens,
+            )
+            bucket_generation_tokens = [
+                100,
+                300,
+                500,
+                1000,
+                1200,
+                1500,
+                1700,
+                2000,
+                2500,
+                3000,
+                3500,
+                4000,
+                4500,
+                5000,
+                6000,
+                7000,
+                8000,
+                9000,
+                10000,
+            ]
+            self.generation_tokens_histogram = Histogram(
+                name="sglang:generation_tokens_histogram",
+                documentation="Histogram of generation token length.",
+                labelnames=labels.keys(),
+                buckets=bucket_generation_tokens,
+            )
+
         self.cached_tokens_total = Counter(
             name="sglang:cached_tokens_total",
             documentation="Number of cached prompt tokens.",
@@ -142,11 +396,14 @@ class TokenizerMetricsCollector:
             labelnames=labels.keys(),
         )
 
-        self.
-            name="sglang:
-            documentation="
+        self.num_so_requests_total = Counter(
+            name="sglang:num_so_requests_total",
+            documentation="Number of structured output requests processed.",
             labelnames=labels.keys(),
-
+        )
+
+        if bucket_time_to_first_token is None:
+            bucket_time_to_first_token = [
                 0.1,
                 0.2,
                 0.4,
@@ -165,14 +422,33 @@ class TokenizerMetricsCollector:
                 100,
                 200,
                 400,
-            ]
-        )
+            ]
 
-
-
-
-
-
+        if bucket_e2e_request_latency is None:
+            bucket_e2e_request_latency = [
+                0.1,
+                0.2,
+                0.4,
+                0.6,
+                0.8,
+                1,
+                2,
+                4,
+                6,
+                8,
+                10,
+                20,
+                40,
+                60,
+                80,
+                100,
+                200,
+                400,
+                800,
+            ]
+
+        if bucket_inter_token_latency is None:
+            bucket_inter_token_latency = [
                 0.002,
                 0.004,
                 0.006,
@@ -196,34 +472,27 @@ class TokenizerMetricsCollector:
                 4.000,
                 6.000,
                 8.000,
-            ]
+            ]
+
+        self.histogram_time_to_first_token = Histogram(
+            name="sglang:time_to_first_token_seconds",
+            documentation="Histogram of time to first token in seconds.",
+            labelnames=labels.keys(),
+            buckets=bucket_time_to_first_token,
+        )
+
+        self.histogram_inter_token_latency_seconds = Histogram(
+            name="sglang:inter_token_latency_seconds",
+            documentation="Histogram of inter-token latency in seconds.",
+            labelnames=labels.keys(),
+            buckets=bucket_inter_token_latency,
         )
 
         self.histogram_e2e_request_latency = Histogram(
             name="sglang:e2e_request_latency_seconds",
             documentation="Histogram of End-to-end request latency in seconds",
             labelnames=labels.keys(),
-            buckets=
-                0.1,
-                0.2,
-                0.4,
-                0.6,
-                0.8,
-                1,
-                2,
-                4,
-                6,
-                8,
-                10,
-                20,
-                40,
-                60,
-                80,
-                100,
-                200,
-                400,
-                800,
-            ],
+            buckets=bucket_e2e_request_latency,
         )
 
     def _log_histogram(self, histogram, data: Union[int, float]) -> None:
@@ -235,13 +504,19 @@ class TokenizerMetricsCollector:
         generation_tokens: int,
         cached_tokens: int,
         e2e_latency: float,
+        has_grammar: bool,
     ):
         self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
         self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
         if cached_tokens > 0:
             self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
         self.num_requests_total.labels(**self.labels).inc(1)
+        if has_grammar:
+            self.num_so_requests_total.labels(**self.labels).inc(1)
         self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
+        if self.collect_tokens_histogram:
+            self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
+            self._log_histogram(self.generation_tokens_histogram, generation_tokens)
 
     def observe_time_to_first_token(self, value: float):
         self.histogram_time_to_first_token.labels(**self.labels).observe(value)
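The `TimeStats` class added above is the core of the new per-request timing instrumentation. The following sketch is not part of the diff; the timestamp values are illustrative. It shows how a request that only sets the unified-path timestamps is classified and formatted:

import time

from sglang.srt.metrics.collector import TimeStats

stats = TimeStats()
t0 = time.perf_counter()
stats.wait_queue_entry_time = t0      # request enters the waiting queue
stats.forward_entry_time = t0 + 0.05  # scheduler launches the forward pass
stats.completion_time = t0 + 1.25     # last token is produced

# None of the prefill/decode disaggregation timestamps were set,
# so get_type() classifies the request as UNIFIED.
assert stats.get_type() == TimeStats.RequestType.UNIFIED
print(stats)  # queue_duration=50.00ms, forward_duration=1200.00ms, start_time=...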
sglang/srt/mm_utils.py
CHANGED
@@ -36,6 +36,16 @@ from io import BytesIO
 
 import numpy as np
 from PIL import Image
 
+from sglang.srt.utils import flatten_nested_list
+
+
+def has_valid_data(data) -> bool:
+    if data is None:
+        return False
+    if isinstance(data, list):
+        return any(has_valid_data(item) for item in flatten_nested_list(data))
+    return True
+
 
 def select_best_resolution(original_size, possible_resolutions):
     """
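The new `has_valid_data` helper treats `None` and arbitrarily nested lists of `None` as empty. Below is a self-contained sketch of its behavior; the `flatten_nested_list` stand-in mirrors the helper imported from `sglang.srt.utils` and is inlined only so the snippet runs on its own:

def flatten_nested_list(nested):
    # Stand-in for sglang.srt.utils.flatten_nested_list: recursively
    # flattens nested lists into a single flat list of leaves.
    if isinstance(nested, list):
        return [leaf for item in nested for leaf in flatten_nested_list(item)]
    return [nested]

def has_valid_data(data) -> bool:
    if data is None:
        return False
    if isinstance(data, list):
        return any(has_valid_data(item) for item in flatten_nested_list(data))
    return True

print(has_valid_data(None))                 # False
print(has_valid_data([]))                   # False: any() over an empty list
print(has_valid_data([None, [None]]))       # False: only None leaves
print(has_valid_data([None, ["img.png"]]))  # True: one real item suffices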
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -19,7 +19,7 @@ import bisect
 import inspect
 import os
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Callable, Optional, Union
 
 import torch
 import tqdm
@@ -30,6 +30,7 @@ from sglang.srt.distributed.parallel_state import GroupCoordinator, graph_capture
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.moe.fused_moe_native import fused_moe_forward_native
 from sglang.srt.layers.torchao_utils import save_gemlite_cache
+from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import (
     CaptureHiddenMode,
     ForwardBatch,
@@ -40,14 +41,18 @@ from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.utils import (
     get_available_gpu_memory,
     get_device_memory_capacity,
-    is_hip,
     rank0_log,
 )
 
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
 
-
+# Detect whether the current forward pass is in capture mode
+is_capture_mode = False
+
+
+def get_is_capture_mode():
+    return is_capture_mode
 
 
 def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
@@ -137,7 +142,6 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     )
 
     gpu_mem = get_device_memory_capacity()
-    # Batch size of each rank will not become so large when DP is on
     if gpu_mem is not None and gpu_mem > 96 * 1024:
         capture_bs += list(range(160, 257, 8))
 
@@ -148,12 +152,15 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
         model_runner.req_to_token_pool.size
     ]
 
-    capture_bs = list(sorted(set(capture_bs)))
-
-    assert len(capture_bs) > 0 and capture_bs[0] > 0
-    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
     if server_args.cuda_graph_max_bs:
         capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]
+        if max(capture_bs) < server_args.cuda_graph_max_bs:
+            capture_bs += list(
+                range(max(capture_bs), server_args.cuda_graph_max_bs + 1, 16)
+            )
+    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
+    capture_bs = list(sorted(set(capture_bs)))
+    assert len(capture_bs) > 0 and capture_bs[0] > 0
     compile_bs = (
         [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
         if server_args.enable_torch_compile
@@ -211,7 +218,10 @@ class CudaGraphRunner:
         # Attention backend
         self.max_bs = max(self.capture_bs)
         self.max_num_token = self.max_bs * self.num_tokens_per_bs
-
+        if global_server_args_dict["attention_backend"] == "flashmla":
+            self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs)
+        else:
+            self.model_runner.attn_backend.init_cuda_graph_state(self.max_num_token)
         self.seq_len_fill_value = (
             self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
         )
@@ -237,6 +247,7 @@ class CudaGraphRunner:
         self.out_cache_loc = torch.zeros((self.max_num_token,), dtype=torch.int64)
         self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
         self.mrope_positions = torch.zeros((3, self.max_bs), dtype=torch.int64)
+        self.num_token_non_padded = torch.zeros((1,), dtype=torch.int32)
 
         # pipeline parallelism
         if self.pp_size > 1:
@@ -296,28 +307,23 @@ class CudaGraphRunner:
             self.capture()
         except RuntimeError as e:
             raise Exception(
-                f"Capture
+                f"Capture CUDA graph failed: {e}\n"
                 "Possible solutions:\n"
                 "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                 "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable
+                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
 
     @contextmanager
     def model_capture_mode(self):
-
-
-        if hasattr(self.model_runner.token_to_kv_pool, "capture_mode"):
-            self.model_runner.token_to_kv_pool.capture_mode = True
+        global is_capture_mode
+        is_capture_mode = True
 
         yield
 
-
-        self.model_runner.model.capture_mode = False
-        if hasattr(self.model_runner.token_to_kv_pool, "capture_mode"):
-            self.model_runner.token_to_kv_pool.capture_mode = False
+        is_capture_mode = False
 
     def can_run(self, forward_batch: ForwardBatch):
         if self.enable_dp_attention or self.enable_sp_layernorm:
@@ -400,6 +406,7 @@ class CudaGraphRunner:
         else:
             encoder_lens = None
         mrope_positions = self.mrope_positions[:, :bs]
+        self.num_token_non_padded[...] = num_tokens
 
         # pipeline parallelism
        if self.pp_size > 1:
@@ -458,6 +465,7 @@ class CudaGraphRunner:
             spec_info=spec_info,
             capture_hidden_mode=self.capture_hidden_mode,
             lora_paths=lora_paths,
+            num_token_non_padded=self.num_token_non_padded,
         )
 
         if lora_paths is not None:
@@ -553,6 +561,7 @@ class CudaGraphRunner:
         self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
         self.out_cache_loc[:raw_num_token].copy_(forward_batch.out_cache_loc)
         self.positions[:raw_num_token].copy_(forward_batch.positions)
+        self.num_token_non_padded[...] = len(forward_batch.input_ids)
        if forward_batch.seq_lens_cpu is not None:
             if bs != raw_bs:
                 self.seq_lens_cpu.fill_(1)
@@ -605,6 +614,7 @@ class CudaGraphRunner:
 
         # Replay
         self.graphs[self.bs].replay()
+
         output = self.output_buffers[self.bs]
         if isinstance(output, LogitsProcessorOutput):
             return LogitsProcessorOutput(
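The reworked `get_batch_sizes_to_capture` logic above now pads the capture list in steps of 16 toward `--cuda-graph-max-bs` before clipping, deduplicating, and sorting. A standalone sketch of that flow follows; the seed list, max-bs limit, and pool size are all hypothetical values made up for illustration:

capture_bs = list(range(1, 33)) + [48, 64, 96, 128, 160]  # hypothetical seed list
cuda_graph_max_bs = 256       # stand-in for server_args.cuda_graph_max_bs
req_to_token_pool_size = 512  # stand-in for model_runner.req_to_token_pool.size

capture_bs = [bs for bs in capture_bs if bs <= cuda_graph_max_bs]
if max(capture_bs) < cuda_graph_max_bs:
    # New in post5: extend the list toward the limit in steps of 16.
    capture_bs += list(range(max(capture_bs), cuda_graph_max_bs + 1, 16))
capture_bs = [bs for bs in capture_bs if bs <= req_to_token_pool_size]
capture_bs = sorted(set(capture_bs))
assert len(capture_bs) > 0 and capture_bs[0] > 0
print(capture_bs[-5:])  # [192, 208, 224, 240, 256]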
|