sglang 0.4.1.post1__py3-none-any.whl → 0.4.1.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +1 -0
- sglang/srt/configs/model_config.py +11 -2
- sglang/srt/layers/attention/__init__.py +0 -1
- sglang/srt/layers/attention/flashinfer_backend.py +54 -41
- sglang/srt/layers/logits_processor.py +30 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -26
- sglang/srt/layers/quantization/fp8.py +42 -2
- sglang/srt/layers/quantization/fp8_kernel.py +77 -18
- sglang/srt/layers/quantization/fp8_utils.py +8 -2
- sglang/srt/managers/io_struct.py +29 -8
- sglang/srt/managers/schedule_batch.py +22 -15
- sglang/srt/managers/scheduler.py +60 -20
- sglang/srt/managers/session_controller.py +102 -27
- sglang/srt/managers/tokenizer_manager.py +41 -10
- sglang/srt/managers/tp_worker.py +7 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -0
- sglang/srt/model_executor/forward_batch_info.py +42 -3
- sglang/srt/model_executor/model_runner.py +4 -0
- sglang/srt/models/llama.py +11 -0
- sglang/srt/models/llama_eagle.py +132 -0
- sglang/srt/openai_api/adapter.py +60 -2
- sglang/srt/openai_api/protocol.py +48 -0
- sglang/srt/server.py +26 -3
- sglang/srt/server_args.py +17 -30
- sglang/srt/speculative/spec_info.py +19 -0
- sglang/srt/utils.py +62 -0
- sglang/version.py +1 -1
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/METADATA +3 -3
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/RECORD +32 -30
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8_kernel.py
CHANGED
@@ -12,12 +12,23 @@
 # limitations under the License.
 # ==============================================================================

-
+import functools
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple

 import torch
 import triton
 import triton.language as tl

+from sglang.srt.utils import get_device_name, is_hip
+
+is_hip_ = is_hip()
+fp8_type_ = torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
+
+logger = logging.getLogger(__name__)
+

 @triton.jit
 def _per_token_group_quant_fp8(
@@ -65,7 +76,7 @@ def per_token_group_quant_fp8(
     x: torch.Tensor,
     group_size: int,
     eps: float = 1e-10,
-    dtype: torch.dtype =
+    dtype: torch.dtype = fp8_type_,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.

@@ -87,9 +98,13 @@ def per_token_group_quant_fp8(
     assert x.is_contiguous(), "`x` is not contiguous"

     finfo = torch.finfo(dtype)
-    fp8_min = finfo.min
     fp8_max = finfo.max

+    if is_hip_:
+        fp8_max = 224.0
+
+    fp8_min = -fp8_max
+
     x_q = torch.empty_like(x, device=x.device, dtype=dtype)
     M = x.numel() // group_size
     N = group_size
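Note: the change above defaults the quantization dtype to `fp8_type_` (e4m3fnuz on ROCm) and caps `fp8_max` at 224.0 on HIP builds. Below is a minimal pure-PyTorch sketch of the per-token-group scaling that the Triton kernel implements, assuming a CUDA build; the function name and eps handling are illustrative only, not the package's API.

```python
import torch

def per_token_group_quant_sketch(x: torch.Tensor, group_size: int, eps: float = 1e-10):
    # Assumes the last dimension is divisible by group_size, as the real kernel does.
    fp8_dtype = torch.float8_e4m3fn              # torch.float8_e4m3fnuz when is_hip_ is True
    fp8_max = float(torch.finfo(fp8_dtype).max)  # the diff lowers this to 224.0 on HIP
    fp8_min = -fp8_max

    groups = x.reshape(-1, group_size)
    amax = groups.abs().amax(dim=-1, keepdim=True).clamp(min=eps)
    scale = amax / fp8_max                       # one scale per token group
    x_q = (groups / scale).clamp(fp8_min, fp8_max).to(fp8_dtype).reshape_as(x)
    return x_q, scale.squeeze(-1)

x = torch.randn(4, 256)
x_q, x_s = per_token_group_quant_sketch(x, group_size=128)
print(x_q.shape, x_s.shape)  # torch.Size([4, 256]) torch.Size([8])
```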
@@ -205,6 +220,48 @@ def _w8a8_block_fp8_matmul(
     tl.store(c_ptrs, c, mask=c_mask)


+@functools.lru_cache
+def get_w8a8_block_fp8_configs(
+    N: int, K: int, block_n: int, block_k: int
+) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the w8a8 block fp8 kernel.
+
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    device_name = get_device_name().replace(" ", "_")
+    json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json"
+
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            logger.info(
+                "Using configuration from %s for W8A8 Block FP8 kernel.",
+                config_file_path,
+            )
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    logger.warning(
+        (
+            "Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! "
+            "Config file not found at %s"
+        ),
+        config_file_path,
+    )
+    return None
+
+
 def w8a8_block_fp8_matmul(
     A: torch.Tensor,
     B: torch.Tensor,
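The new `get_w8a8_block_fp8_configs` helper resolves a tuned kernel config from a JSON file named after the weight shape, device, and block shape; callers then pick the entry whose batch-size key is closest to M. A standalone sketch of that lookup, with made-up shapes, device name, and a fake config dict:

```python
import json

N, K, block_n, block_k = 4096, 512, 128, 128
device_name = "NVIDIA_H100_80GB_HBM3"  # get_device_name().replace(" ", "_") in the diff
json_file_name = (
    f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,"
    f"block_shape=[{block_n}, {block_k}].json"
)
print(json_file_name)

# Suppose the file maps batch sizes to Triton launch configs (keys become ints):
configs = {int(k): v for k, v in json.loads(
    '{"1": {"BLOCK_SIZE_M": 16}, "64": {"BLOCK_SIZE_M": 64}, "1024": {"BLOCK_SIZE_M": 128}}'
).items()}

M = 200  # number of rows in A for this call
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
print(config)  # picks the grid point closest to M, here the 64 entry
```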
@@ -245,17 +302,22 @@ def w8a8_block_fp8_matmul(
     C_shape = A.shape[:-1] + (N,)
     C = A.new_empty(C_shape, dtype=output_dtype)

-
-
-
-
-
-
-
-
-
-
-
+    configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # If an optimal configuration map has been found, look up the
+        # optimal config
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+    else:
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }

     def grid(META):
         return (
@@ -283,10 +345,7 @@ def w8a8_block_fp8_matmul(
         As.stride(-1),
         Bs.stride(1),
         Bs.stride(0),
-
-        BLOCK_SIZE_N=BLOCK_SIZE_N,
-        BLOCK_SIZE_K=BLOCK_SIZE_K,
-        GROUP_SIZE_M=8,
+        **config,
     )

     return C
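For reference, the selected config is splatted into the Triton launch via `**config`, and the block sizes determine how many programs are launched. The exact grid function is outside this hunk, so the arithmetic below is an assumption based on the usual tiled-matmul layout:

```python
M, N, K = 200, 4096, 512
block_size = (128, 128)  # (block_n, block_k) as used in the diff

config = {
    "BLOCK_SIZE_M": 64,
    "BLOCK_SIZE_N": block_size[0],
    "BLOCK_SIZE_K": block_size[1],
    "GROUP_SIZE_M": 32,
    "num_warps": 4,
    "num_stages": 3,
}

cdiv = lambda a, b: (a + b - 1) // b
# One program per (M-tile, N-tile); GROUP_SIZE_M typically only affects tile ordering.
grid = (cdiv(M, config["BLOCK_SIZE_M"]) * cdiv(N, config["BLOCK_SIZE_N"]),)
print(grid)  # (128,)
```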
sglang/srt/layers/quantization/fp8_utils.py
CHANGED
@@ -7,6 +7,9 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_fp8,
     w8a8_block_fp8_matmul,
 )
+from sglang.srt.utils import is_hip
+
+is_hip_ = is_hip()


 def normalize_e4m3fn_to_e4m3fnuz(
@@ -63,8 +66,11 @@ def input_to_float8(
     finfo = torch.finfo(dtype)
     min_val, max_val = x.aminmax()
     amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
-
-
+    fp8_max = finfo.max
+    if is_hip_:
+        fp8_max = 224.0
+    scale = fp8_max / amax
+    x_scl_sat = (x * scale).clamp(min=-fp8_max, max=fp8_max)
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()

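A hedged sketch of what the modified `input_to_float8` computes on a non-HIP build: a per-tensor scale from the absolute max, a saturating cast, and the reciprocal scale returned for dequantization. The body paraphrases the diff; the wrapper function name is illustrative.

```python
import torch

def input_to_float8_sketch(x: torch.Tensor, dtype=torch.float8_e4m3fn):
    finfo = torch.finfo(dtype)
    min_val, max_val = x.aminmax()
    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
    fp8_max = finfo.max        # the diff lowers this to 224.0 when running on HIP
    scale = fp8_max / amax
    x_scl_sat = (x * scale).clamp(min=-fp8_max, max=fp8_max)
    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()

x = torch.randn(8, 16)
x_fp8, inv_scale = input_to_float8_sketch(x)
print(x_fp8.dtype, inv_scale.item())  # torch.float8_e4m3fn and the dequant scale
```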
sglang/srt/managers/io_struct.py
CHANGED
@@ -21,10 +21,20 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Dict, List, Optional, Tuple, Union

+import torch
+
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams


+@dataclass
+class SessionParams:
+    id: Optional[str] = None
+    rid: Optional[str] = None
+    offset: Optional[int] = None
+    replace: Optional[bool] = None
+
+
 @dataclass
 class GenerateReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
@@ -56,10 +66,8 @@ class GenerateReqInput:
     # LoRA related
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None

-    # Session
-
-        Union[List[Tuple[str, Optional[str]]], Tuple[str, Optional[str]]]
-    ] = None
+    # Session info for continual prompting
+    session_params: Optional[Union[List[Dict], Dict]] = None

     def normalize_batch_and_arguments(self):
         if (
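With this change, per-request session state moves from a positional tuple to a `session_params` dict, which is parsed into the new `SessionParams` dataclass. A sketch of what such a payload might look like; only the dataclass fields come from the diff, the surrounding request fields and values are assumptions.

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class SessionParams:                 # mirrors the dataclass added in io_struct.py
    id: Optional[str] = None         # which open session to continue
    rid: Optional[str] = None        # request id within the session to branch from
    offset: Optional[int] = None     # token offset to resume at
    replace: Optional[bool] = None   # whether to replace the branch instead of appending

# On the wire, GenerateReqInput.session_params is a plain dict (or a list of dicts
# for a batch); the tokenizer side converts it into SessionParams.
generate_payload = {
    "text": "And then what happened?",
    "session_params": {"id": "sess-1234", "rid": "req-0007", "offset": -1, "replace": False},
}
print(SessionParams(**generate_payload["session_params"]))
```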
@@ -221,9 +229,8 @@ class TokenizedGenerateReqInput:
     # The input embeds
     input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None

-    # Session
-
-    session_rid: Optional[str] = None
+    # Session info for continual prompting
+    session_params: Optional[SessionParams] = None


 @dataclass
@@ -407,6 +414,18 @@ class UpdateWeightsFromDistributedReqOutput:
     message: str


+@dataclass
+class UpdateWeightsFromTensorReqInput:
+    name: str
+    tensor: torch.Tensor
+
+
+@dataclass
+class UpdateWeightsFromTensorReqOutput:
+    success: bool
+    message: str
+
+
 @dataclass
 class InitWeightsUpdateGroupReqInput:
     # The master address
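The new request/response pair lets a caller push a single named tensor into the running model. A minimal construction sketch; the Llama-style parameter name is an illustrative assumption, and how the tensor is transported between processes is outside this hunk.

```python
from dataclasses import dataclass
import torch

@dataclass
class UpdateWeightsFromTensorReqInput:
    name: str             # parameter name as it appears in the model's state dict
    tensor: torch.Tensor  # replacement weight

@dataclass
class UpdateWeightsFromTensorReqOutput:
    success: bool
    message: str

req = UpdateWeightsFromTensorReqInput(
    name="model.layers.0.self_attn.q_proj.weight",  # hypothetical parameter name
    tensor=torch.zeros(16, 16),
)
print(req.name, tuple(req.tensor.shape))
```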
@@ -454,6 +473,7 @@ class ProfileReq(Enum):
 @dataclass
 class OpenSessionReqInput:
     capacity_of_str_len: int
+    session_id: Optional[str] = None


 @dataclass
@@ -463,4 +483,5 @@ class CloseSessionReqInput:

 @dataclass
 class OpenSessionReqOutput:
-    session_id: str
+    session_id: Optional[str]
+    success: bool
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -29,7 +29,7 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch

 import dataclasses
 import logging
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Set, Tuple, Union

 import numpy as np
 import torch
@@ -209,6 +209,7 @@ class Req:
         lora_path: Optional[str] = None,
         input_embeds: Optional[List[List[float]]] = None,
         session_id: Optional[str] = None,
+        eos_token_ids: Optional[Set[int]] = None,
     ):
         # Input and output info
         self.rid = rid
@@ -236,6 +237,7 @@ class Req:
         self.finished_reason = None
         self.to_abort = False
         self.stream = stream
+        self.eos_token_ids = eos_token_ids

         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -395,18 +397,23 @@ class Req:

         last_token_id = self.output_ids[-1]

-
-
-
-
-
-
-
-
-
-
-
-
+        if not self.sampling_params.ignore_eos:
+            matched_eos = False
+
+            # Check stop token ids
+            if self.sampling_params.stop_token_ids:
+                matched_eos = last_token_id in self.sampling_params.stop_token_ids
+            if self.eos_token_ids:
+                matched_eos |= last_token_id in self.eos_token_ids
+            if self.tokenizer is not None:
+                matched_eos |= last_token_id == self.tokenizer.eos_token_id
+                if self.tokenizer.additional_stop_token_ids:
+                    matched_eos |= (
+                        last_token_id in self.tokenizer.additional_stop_token_ids
+                    )
+            if matched_eos:
+                self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
+                return

         # Check stop strings
         if len(self.sampling_params.stop_strs) > 0:
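The rewritten EOS check ORs several stop sources together unless `ignore_eos` is set. A standalone sketch of that predicate, pulled out of `Req.check_finished` for illustration; the helper and its stand-in arguments are assumptions, and `tokenizer.additional_stop_token_ids` is omitted for brevity.

```python
from typing import Optional, Set

def matched_eos(
    last_token_id: int,
    ignore_eos: bool,
    stop_token_ids: Set[int],            # sampling_params.stop_token_ids
    eos_token_ids: Optional[Set[int]],   # generation-config eos ids passed to Req
    tokenizer_eos_id: Optional[int],     # tokenizer.eos_token_id
) -> bool:
    if ignore_eos:
        return False
    matched = last_token_id in stop_token_ids
    if eos_token_ids:
        matched |= last_token_id in eos_token_ids
    if tokenizer_eos_id is not None:
        matched |= last_token_id == tokenizer_eos_id
    return matched

print(matched_eos(2, False, set(), {2, 32000}, 2))  # True: hits both eos sources
print(matched_eos(2, True, set(), {2}, 2))          # False: ignore_eos wins
```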
@@ -836,8 +843,8 @@ class ScheduleBatch:
         # TODO (lianmin): Revisit this. It should be seq_len - 1
         self.extend_logprob_start_lens.extend([0] * running_bs)

-    def check_decode_mem(self):
-        bs = len(self.reqs)
+    def check_decode_mem(self, buf_multiplier=1):
+        bs = len(self.reqs) * buf_multiplier
         if self.token_to_kv_pool.available_size() >= bs:
             return True

sglang/srt/managers/scheduler.py
CHANGED
@@ -22,7 +22,7 @@ import warnings
 from collections import deque
 from concurrent import futures
 from types import SimpleNamespace
-from typing import
+from typing import Dict, List, Optional, Tuple

 import psutil
 import setproctitle
@@ -52,6 +52,8 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightFromDiskReqOutput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromDistributedReqOutput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightsFromTensorReqOutput,
 )
 from sglang.srt.managers.schedule_batch import (
     FINISH_ABORT,
@@ -88,7 +90,7 @@ from sglang.utils import get_exception_traceback

 logger = logging.getLogger(__name__)

-# Test retract decode
+# Test retract decode for debugging purposes
 test_retract = get_bool_env_var("SGLANG_TEST_RETRACT")

@@ -127,12 +129,12 @@ class Scheduler:
         )

         if server_args.skip_tokenizer_init:
-            # Directly send to the
+            # Directly send to the TokenizerManager
             self.send_to_detokenizer = get_zmq_socket(
                 context, zmq.PUSH, port_args.tokenizer_ipc_name
             )
         else:
-            # Send to the
+            # Send to the DetokenizerManager
             self.send_to_detokenizer = get_zmq_socket(
                 context, zmq.PUSH, port_args.detokenizer_ipc_name
             )
@@ -383,7 +385,8 @@ class Scheduler:
             self.process_input_requests(recv_reqs)

             batch = self.get_next_batch_to_run()
-
+
+            if self.server_args.enable_dp_attention:  # TODO: simplify this
                 batch = self.prepare_dp_attn_batch(batch)

             self.cur_batch = batch
@@ -392,7 +395,7 @@ class Scheduler:
                 result = self.run_batch(batch)
                 self.process_batch_result(batch, result)
             else:
-                #
+                # When the server is idle, so self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio

@@ -409,12 +412,13 @@ class Scheduler:

             batch = self.get_next_batch_to_run()
             self.cur_batch = batch
+
             if batch:
                 result = self.run_batch(batch)
                 result_queue.append((batch.copy(), result))

                 if self.last_batch is None:
-                    #
+                    # Create a dummy first batch to start the pipeline for overlap scheduler.
                     # It is now used for triggering the sampling_info_done event.
                     tmp_batch = ScheduleBatch(
                         reqs=None,
@@ -424,19 +428,21 @@ class Scheduler:
                     self.process_batch_result(tmp_batch, None)

             if self.last_batch:
+                # Process the results of the last batch
                 tmp_batch, tmp_result = result_queue.popleft()
                 tmp_batch.next_batch_sampling_info = (
                     self.tp_worker.cur_sampling_info if batch else None
                 )
                 self.process_batch_result(tmp_batch, tmp_result)
             elif batch is None:
-                #
+                # When the server is idle, so self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio

             self.last_batch = batch

-    def recv_requests(self):
+    def recv_requests(self) -> List[Req]:
+        """Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
         if self.tp_rank == 0 or self.server_args.enable_dp_attention:
             recv_reqs = []

@@ -478,6 +484,11 @@ class Scheduler:
                 self.send_to_tokenizer.send_pyobj(
                     UpdateWeightsFromDistributedReqOutput(success, message)
                 )
+            elif isinstance(recv_req, UpdateWeightsFromTensorReqInput):
+                success, message = self.update_weights_from_tensor(recv_req)
+                self.send_to_tokenizer.send_pyobj(
+                    UpdateWeightsFromTensorReqOutput(success, message)
+                )
             elif isinstance(recv_req, GetWeightsByNameReqInput):
                 parameter = self.get_weights_by_name(recv_req)
                 self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
@@ -487,8 +498,10 @@ class Scheduler:
                 else:
                     self.stop_profile()
             elif isinstance(recv_req, OpenSessionReqInput):
-                session_id = self.open_session(recv_req)
-                self.send_to_tokenizer.send_pyobj(
+                session_id, success = self.open_session(recv_req)
+                self.send_to_tokenizer.send_pyobj(
+                    OpenSessionReqOutput(session_id=session_id, success=success)
+                )
             elif isinstance(recv_req, CloseSessionReqInput):
                 self.close_session(recv_req)
             else:
@@ -499,7 +512,11 @@ class Scheduler:
         recv_req: TokenizedGenerateReqInput,
     ):
         # Create a new request
-        if
+        if (
+            recv_req.session_params is None
+            or recv_req.session_params.id is None
+            or recv_req.session_params.id not in self.sessions
+        ):

             if recv_req.input_embeds is not None:
                 # Generate fake input_ids based on the length of input_embeds
@@ -517,18 +534,22 @@ class Scheduler:
                 stream=recv_req.stream,
                 lora_path=recv_req.lora_path,
                 input_embeds=recv_req.input_embeds,
+                eos_token_ids=self.model_config.hf_eos_token_id,
             )
             req.tokenizer = self.tokenizer

-            if
+            if (
+                recv_req.session_params is not None
+                and recv_req.session_params.id is not None
+            ):
                 req.finished_reason = FINISH_ABORT(
-                    f"Invalid request: session id {recv_req.
+                    f"Invalid request: session id {recv_req.session_params.id} does not exist"
                 )
                 self.waiting_queue.append(req)
                 return
         else:
-            # Create a new request from a
-            session = self.sessions[recv_req.
+            # Create a new request from a previous session
+            session = self.sessions[recv_req.session_params.id]
             req = session.create_req(recv_req, self.tokenizer)
             if isinstance(req.finished_reason, FINISH_ABORT):
                 self.waiting_queue.append(req)
@@ -804,6 +825,8 @@ class Scheduler:
                 if res == AddReqResult.NO_TOKEN:
                     self.batch_is_full = True
                 break
+            if self.server_args.prefill_only_one_req:
+                break

         # Update waiting queue
         can_run_list = adder.can_run_list
@@ -1457,6 +1480,17 @@ class Scheduler:
             logger.error(message)
         return success, message

+    def update_weights_from_tensor(self, recv_req: UpdateWeightsFromTensorReqInput):
+        """Update the online model parameter from tensors."""
+        success, message = self.tp_worker.update_weights_from_tensor(recv_req)
+        # TODO extract common code b/t update_weights_from_distributed and update_weights_from_tensor later
+        if success:
+            flash_cache_success = self.flush_cache()
+            assert flash_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return success, message
+
     def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
         parameter = self.tp_worker.get_weights_by_name(recv_req)
         return parameter
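A sketch of the control flow the new `update_weights_from_tensor` method implements: delegate the named-tensor update to the TP worker, then flush the cache on success so stale entries are not reused. The stub worker and scheduler below are illustrative stand-ins, not the package's classes.

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scheduler-sketch")

class _StubTpWorker:
    def update_weights_from_tensor(self, recv_req):
        # A real worker would copy recv_req.tensor into the named parameter.
        return True, f"updated {recv_req!r}"

class _SchedulerSketch:
    def __init__(self):
        self.tp_worker = _StubTpWorker()

    def flush_cache(self) -> bool:
        return True  # the real method clears the prefix/KV caches

    def update_weights_from_tensor(self, recv_req):
        success, message = self.tp_worker.update_weights_from_tensor(recv_req)
        if success:
            assert self.flush_cache(), "Cache flush failed after updating weights"
        else:
            logger.error(message)
        return success, message

print(_SchedulerSketch().update_weights_from_tensor("q_proj.weight"))
```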
@@ -1475,16 +1509,20 @@ class Scheduler:
         )
         logger.info("Profiler is done")

-    def open_session(self, recv_req: OpenSessionReqInput) -> str:
+    def open_session(self, recv_req: OpenSessionReqInput) -> Tuple[Optional[str], bool]:
         # handle error
         session_id = recv_req.session_id
         if session_id in self.sessions:
             logger.warning(f"session id {session_id} already exist, cannot open.")
+            return session_id, False
+        elif session_id is None:
+            logger.warning(f"session id is None, cannot open.")
+            return session_id, False
         else:
             self.sessions[session_id] = Session(
                 recv_req.capacity_of_str_len, session_id
             )
-
+            return session_id, True

     def close_session(self, recv_req: CloseSessionReqInput):
         # handle error
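`open_session` now returns a `(session_id, success)` tuple so the tokenizer side can tell a rejected open (duplicate or missing id) apart from a successful one. A small stand-alone sketch of that contract; the store class is an assumption.

```python
from typing import Dict, Optional, Tuple

class _SessionStoreSketch:
    def __init__(self):
        self.sessions: Dict[str, dict] = {}

    def open_session(self, session_id: Optional[str]) -> Tuple[Optional[str], bool]:
        if session_id is None or session_id in self.sessions:
            return session_id, False      # mirrors the two warning branches in the diff
        self.sessions[session_id] = {}    # the real code stores a Session object
        return session_id, True

store = _SessionStoreSketch()
print(store.open_session("sess-1"))  # ('sess-1', True)
print(store.open_session("sess-1"))  # ('sess-1', False): already open
print(store.open_session(None))      # (None, False): an id is required
```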
@@ -1509,18 +1547,20 @@ def run_scheduler_process(
     if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
         dp_rank = int(os.environ["SGLANG_DP_RANK"])

+    # Configue the logger
     if dp_rank is None:
         configure_logger(server_args, prefix=f" TP{tp_rank}")
     else:
         configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
+    suppress_other_loggers()

-    #
+    # Set cpu affinity to this gpu process
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)

-    suppress_other_loggers()
     parent_process = psutil.Process().parent()

+    # Create a scheduler and run the event loop
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
         pipe_writer.send(