sglang 0.4.1.post2__py3-none-any.whl → 0.4.1.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/srt/layers/attention/__init__.py +14 -5
  3. sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
  4. sglang/srt/layers/attention/flashinfer_backend.py +211 -81
  5. sglang/srt/layers/attention/torch_native_backend.py +1 -38
  6. sglang/srt/layers/attention/triton_backend.py +20 -11
  7. sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
  8. sglang/srt/layers/logits_processor.py +167 -212
  9. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  10. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  11. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  62. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  81. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +178 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  91. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  94. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  95. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  96. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  97. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +175 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +187 -29
  101. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -6
  102. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  103. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  104. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  105. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  106. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  107. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  108. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  109. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  112. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  113. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  114. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  115. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  116. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  117. sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  118. sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  119. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  120. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  121. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  122. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  123. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  124. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  125. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  126. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  127. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  128. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  129. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  130. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  131. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  132. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  133. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  134. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  135. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  136. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  137. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  138. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  139. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  140. sglang/srt/layers/quantization/fp8.py +2 -2
  141. sglang/srt/layers/sampler.py +57 -21
  142. sglang/srt/layers/torchao_utils.py +17 -3
  143. sglang/srt/managers/detokenizer_manager.py +2 -0
  144. sglang/srt/managers/io_struct.py +12 -3
  145. sglang/srt/managers/schedule_batch.py +26 -2
  146. sglang/srt/managers/schedule_policy.py +159 -90
  147. sglang/srt/managers/scheduler.py +71 -27
  148. sglang/srt/managers/tokenizer_manager.py +29 -20
  149. sglang/srt/managers/tp_worker.py +16 -4
  150. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  151. sglang/srt/model_executor/cuda_graph_runner.py +118 -73
  152. sglang/srt/model_executor/forward_batch_info.py +33 -8
  153. sglang/srt/model_executor/model_runner.py +63 -61
  154. sglang/srt/models/deepseek_v2.py +34 -7
  155. sglang/srt/models/grok.py +97 -26
  156. sglang/srt/openai_api/adapter.py +0 -17
  157. sglang/srt/openai_api/protocol.py +3 -3
  158. sglang/srt/sampling/sampling_batch_info.py +21 -0
  159. sglang/srt/sampling/sampling_params.py +9 -1
  160. sglang/srt/server.py +9 -5
  161. sglang/srt/server_args.py +109 -51
  162. sglang/srt/speculative/build_eagle_tree.py +347 -0
  163. sglang/srt/speculative/eagle_utils.py +618 -0
  164. sglang/srt/speculative/eagle_worker.py +170 -0
  165. sglang/srt/speculative/spec_info.py +5 -0
  166. sglang/srt/utils.py +15 -2
  167. sglang/version.py +1 -1
  168. {sglang-0.4.1.post2.dist-info → sglang-0.4.1.post4.dist-info}/METADATA +9 -8
  169. sglang-0.4.1.post4.dist-info/RECORD +329 -0
  170. {sglang-0.4.1.post2.dist-info → sglang-0.4.1.post4.dist-info}/WHEEL +1 -1
  171. sglang-0.4.1.post2.dist-info/RECORD +0 -197
  172. {sglang-0.4.1.post2.dist-info → sglang-0.4.1.post4.dist-info}/LICENSE +0 -0
  173. {sglang-0.4.1.post2.dist-info → sglang-0.4.1.post4.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py CHANGED
@@ -17,7 +17,7 @@ import gc
 import json
 import logging
 import time
-from typing import Optional
+from typing import List, Optional, Tuple
 
 import torch
 import torch.distributed as dist
@@ -48,8 +48,8 @@ from sglang.srt.mem_cache.memory_pool import (
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader import get_model
-from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.server_args import ServerArgs
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
     enable_show_time_cost,
     get_available_gpu_memory,
@@ -75,6 +75,7 @@ class ModelRunner:
         tp_size: int,
         nccl_port: int,
         server_args: ServerArgs,
+        is_draft_worker: bool = False,
     ):
         # Parse args
         self.model_config = model_config
@@ -85,8 +86,12 @@
         self.tp_size = tp_size
         self.dist_port = nccl_port
         self.server_args = server_args
+        self.is_draft_worker = is_draft_worker
         self.is_generation = model_config.is_generation
         self.is_multimodal = model_config.is_multimodal
+        self.spec_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
 
         # Model-specific adjustment
         if (
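The runner now resolves its speculative-decoding mode once, from the --speculative-algorithm server argument. A minimal sketch of the enum-with-from_string pattern this relies on (the member names below are illustrative, not the actual contents of sglang/srt/speculative/spec_info.py):

from enum import IntEnum, auto
from typing import Optional


class SpeculativeAlgorithmSketch(IntEnum):
    # Hypothetical members; the real enum ships in spec_info.py (new in this release).
    NONE = auto()
    EAGLE = auto()

    def is_none(self) -> bool:
        return self == SpeculativeAlgorithmSketch.NONE

    @staticmethod
    def from_string(name: Optional[str]) -> "SpeculativeAlgorithmSketch":
        # An unset --speculative-algorithm flag maps to NONE.
        table = {None: SpeculativeAlgorithmSketch.NONE, "EAGLE": SpeculativeAlgorithmSketch.EAGLE}
        return table[name]


assert SpeculativeAlgorithmSketch.from_string(None).is_none()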
@@ -192,9 +197,9 @@
         torch.get_device_module(self.device).set_device(self.gpu_id)
         if self.device == "cuda":
             backend = "nccl"
-        # ToDO(liangan1):Just use gloo to bypass the initilization fail
-        # Need to use xccl for xpu backend in the future
         elif self.device == "xpu":
+            # TODO(liangan1):Just use gloo to bypass the initilization fail
+            # Need to use xccl for xpu backend in the future
             backend = "gloo"
         elif self.device == "hpu":
             backend = "hccl"
@@ -206,14 +211,18 @@
         else:
             dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
         set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
-        init_distributed_environment(
-            backend=backend,
-            world_size=self.tp_size,
-            rank=self.tp_rank,
-            local_rank=self.gpu_id,
-            distributed_init_method=dist_init_method,
-        )
-        initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
+
+        if not self.is_draft_worker:
+            # Only initilzie the distributed environment on the target model worker.
+            init_distributed_environment(
+                backend=backend,
+                world_size=self.tp_size,
+                rank=self.tp_rank,
+                local_rank=self.gpu_id,
+                distributed_init_method=dist_init_method,
+            )
+            initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
+
         min_per_gpu_memory = get_available_gpu_memory(
             self.device, self.gpu_id, distributed=self.tp_size > 1
         )
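With is_draft_worker threaded through, a speculative-decoding setup can build a second ModelRunner for the draft model inside the already-initialized process group. A rough wiring sketch under that assumption (argument names before tp_size are not shown in this diff and are assumed; configs and port are placeholders; the real orchestration lives in the new eagle_worker.py):

# Target runner: sets up NCCL / model-parallel groups as before.
target_runner = ModelRunner(
    model_config=target_config, mem_fraction_static=0.8, gpu_id=0, tp_rank=0,
    tp_size=1, nccl_port=port, server_args=server_args,
)
# Draft runner: is_draft_worker=True skips init_distributed_environment()
# and initialize_model_parallel(), reusing the groups created above.
draft_runner = ModelRunner(
    model_config=draft_config, mem_fraction_static=0.8, gpu_id=0, tp_rank=0,
    tp_size=1, nccl_port=port, server_args=server_args, is_draft_worker=True,
)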
@@ -408,7 +417,6 @@
         target_dtype = (
             dtype if isinstance(dtype, torch.dtype) else getattr(torch, dtype)
         )
-        current_dtype = self.dtype if isinstance(self.dtype, str) else self.dtype
 
         assert (
             self._model_update_group is not None
@@ -429,9 +437,9 @@
             logger.error(error_msg)
             return False, error_msg
 
-    def update_weights_from_tensor(self, name, tensor: torch.Tensor):
-        self.model.load_weights([(name, tensor)])
-        return True, "Success"  # TODO error handling
+    def update_weights_from_tensor(self, named_tensors: List[Tuple[str, torch.Tensor]]):
+        self.model.load_weights(named_tensors)
+        return True, "Success"
 
     def get_weights_by_name(
         self, name: str, truncate_size: int = 100
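update_weights_from_tensor now accepts a batch of (name, tensor) pairs instead of a single pair, so several parameters can be swapped in one call. A hedged usage sketch against an already-constructed runner (the parameter names and shapes are placeholders):

import torch

hidden_size, vocab_size = 4096, 32000  # illustrative dimensions

# 0.4.1.post2: runner.update_weights_from_tensor("lm_head.weight", new_tensor)
# 0.4.1.post4: one call, many tensors
ok, msg = runner.update_weights_from_tensor(
    [
        ("lm_head.weight", torch.zeros(vocab_size, hidden_size)),
        ("model.embed_tokens.weight", torch.zeros(vocab_size, hidden_size)),
    ]
)
assert ok, msg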
@@ -507,6 +515,28 @@
         )
 
         self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
+
+        if max_num_reqs is None:
+            max_num_reqs = min(
+                max(
+                    int(
+                        self.max_total_num_tokens / self.model_config.context_len * 512
+                    ),
+                    2048,
+                ),
+                4096,
+            )
+
+        if not self.spec_algorithm.is_none():
+            if self.is_draft_worker:
+                self.max_total_num_tokens = self.server_args.draft_runner_cache_size
+            else:
+                self.server_args.draft_runner_cache_size = (
+                    self.max_total_num_tokens
+                    + max_num_reqs * self.server_args.speculative_num_steps
+                    + 100
+                )
+
         if max_total_tokens is not None:
             if max_total_tokens > self.max_total_num_tokens:
                 logging.warning(
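When speculative decoding is enabled, the target worker now sizes the draft runner's KV cache from its own profiled capacity, the derived request cap, and the number of speculative steps. A small worked example with assumed numbers (none of these are measured values):

max_total_num_tokens = 500_000          # profiled target KV-cache capacity
context_len = 32_768
speculative_num_steps = 5               # --speculative-num-steps

max_num_reqs = min(max(int(max_total_num_tokens / context_len * 512), 2048), 4096)
# int(500_000 / 32_768 * 512) = 7812, clamped into [2048, 4096] -> 4096
draft_runner_cache_size = max_total_num_tokens + max_num_reqs * speculative_num_steps + 100
print(max_num_reqs, draft_runner_cache_size)  # 4096 520580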
@@ -521,17 +551,6 @@
                 "Not enough memory. Please try to increase --mem-fraction-static."
             )
 
-        if max_num_reqs is None:
-            max_num_reqs = min(
-                max(
-                    int(
-                        self.max_total_num_tokens / self.model_config.context_len * 512
-                    ),
-                    2048,
-                ),
-                4096,
-            )
-
         self.req_to_token_pool = ReqToTokenPool(
             size=max_num_reqs + 1,
             max_context_len=self.model_config.context_len + 4,
@@ -651,10 +670,6 @@
             tensor_parallel(self.model, device_mesh)
 
     def forward_decode(self, forward_batch: ForwardBatch):
-        if self.cuda_graph_runner and self.cuda_graph_runner.can_run(forward_batch):
-            return self.cuda_graph_runner.replay(forward_batch)
-
-        forward_batch.positions = (forward_batch.seq_lens - 1).to(torch.int64)
         self.attn_backend.init_forward_metadata(forward_batch)
         return self.model.forward(
             forward_batch.input_ids, forward_batch.positions, forward_batch
@@ -684,14 +699,18 @@
         )
 
     def forward_idle(self, forward_batch: ForwardBatch):
-        if self.cuda_graph_runner and self.cuda_graph_runner.can_run(forward_batch):
-            return self.cuda_graph_runner.replay(forward_batch)
-
         return self.model.forward(
             forward_batch.input_ids, forward_batch.positions, forward_batch
         )
 
     def forward(self, forward_batch: ForwardBatch) -> LogitsProcessorOutput:
+        if (
+            forward_batch.forward_mode.is_cuda_graph()
+            and self.cuda_graph_runner
+            and self.cuda_graph_runner.can_run(forward_batch)
+        ):
+            return self.cuda_graph_runner.replay(forward_batch)
+
         if forward_batch.forward_mode.is_decode():
             return self.forward_decode(forward_batch)
         elif forward_batch.forward_mode.is_extend():
@@ -704,6 +723,7 @@
     def sample(
         self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
     ) -> torch.Tensor:
+        # Apply logit bias
         sampling_info = forward_batch.sampling_info
         if sampling_info.sampling_info_done:
             # Overlap mode: the function update_regex_vocab_mask was executed
@@ -714,35 +734,17 @@
             # Normal mode: Put CPU-heavy tasks here. They will be overlapped with the forward pass.
             sampling_info.update_regex_vocab_mask()
             sampling_info.update_penalties()
-        logits = self.apply_logits_bias(logits_output.next_token_logits, sampling_info)
-
-        # Sample the next tokens.
-        next_token_ids = self.sampler(logits, sampling_info)
+        sampling_info.apply_logits_bias(logits_output.next_token_logits)
+
+        # Sample the next tokens
+        next_token_ids = self.sampler(
+            logits_output,
+            sampling_info,
+            forward_batch.return_logprob,
+            forward_batch.top_logprobs_nums,
+        )
         return next_token_ids
 
-    def apply_logits_bias(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
-        # Apply logit_bias
-        if sampling_info.logit_bias is not None:
-            logits.add_(sampling_info.logit_bias)
-
-        # min-token, presence, frequency
-        if sampling_info.linear_penalties is not None:
-            logits.add_(sampling_info.linear_penalties)
-
-        # repetition
-        if sampling_info.scaling_penalties is not None:
-            logits = torch.where(
-                logits > 0,
-                logits / sampling_info.scaling_penalties,
-                logits * sampling_info.scaling_penalties,
-            )
-
-        # Apply regex vocab_mask
-        if sampling_info.vocab_mask is not None:
-            sampling_info.apply_mask(logits=logits, vocab_mask=sampling_info.vocab_mask)
-
-        return logits
-
     @property
     def model_is_mrope(self) -> bool:
         """Detect if the model has "mrope" rope_scaling type.
sglang/srt/models/deepseek_v2.py CHANGED
@@ -46,6 +46,7 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_utils import (
     block_quant_to_tensor_quant,
     input_to_float8,
+    normalize_e4m3fn_to_e4m3fnuz,
 )
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.vocab_parallel_embedding import (
@@ -55,7 +56,9 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import is_flashinfer_available, is_hip
+
+is_hip_ = is_hip()
 
 if is_flashinfer_available():
     from flashinfer import bmm_fp8
@@ -573,7 +576,13 @@ class DeepseekV2AttentionMLA(nn.Module):
         )
         q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
 
-        if self.w_kc.dtype == torch.float8_e4m3fn:
+        if self.w_kc.dtype == torch.float8_e4m3fnuz:
+            # TODO(kernel): add bmm_fp8 for torch.float8_e4m3fnuz
+            q_nope_out = torch.bmm(
+                q_nope.to(torch.bfloat16).transpose(0, 1),
+                self.w_kc.to(torch.bfloat16) * self.w_scale,
+            )
+        elif self.w_kc.dtype == torch.float8_e4m3fn:
             q_nope_val, q_nope_scale = input_to_float8(
                 q_nope.transpose(0, 1), torch.float8_e4m3fn
             )
@@ -598,7 +607,13 @@
         attn_output = self.attn_mqa(q_input, k_input, v_input, forward_batch)
         attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
 
-        if self.w_vc.dtype == torch.float8_e4m3fn:
+        if self.w_vc.dtype == torch.float8_e4m3fnuz:
+            # TODO(kernel): add bmm_fp8 for torch.float8_e4m3fnuz
+            attn_bmm_output = torch.bmm(
+                attn_output.to(torch.bfloat16).transpose(0, 1),
+                self.w_vc.to(torch.bfloat16) * self.w_scale,
+            )
+        elif self.w_vc.dtype == torch.float8_e4m3fn:
             attn_output_val, attn_output_scale = input_to_float8(
                 attn_output.transpose(0, 1), torch.float8_e4m3fn
             )
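On ROCm the MLA projection weights are stored as float8_e4m3fnuz, for which no bmm_fp8 kernel exists yet, so both branches above fall back to an upcast batched matmul. A standalone sketch of that shape contract with made-up dimensions (the real code upcasts the fp8 weight to bfloat16; values here are placeholders):

import torch

# Stand-ins: q_nope is (seq, num_heads, qk_nope_head_dim),
# w_kc is (num_heads, qk_nope_head_dim, kv_lora_rank).
seq, num_heads, nope_dim, kv_lora_rank = 8, 16, 128, 512
q_nope = torch.randn(seq, num_heads, nope_dim)
w_kc = torch.randn(num_heads, nope_dim, kv_lora_rank)
w_scale = 0.5  # per-tensor dequantization scale

q_nope_out = torch.bmm(q_nope.transpose(0, 1), w_kc * w_scale)
print(q_nope_out.shape)  # torch.Size([16, 8, 512]) -> (num_heads, seq, kv_lora_rank)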
@@ -940,15 +955,25 @@
                 w = self_attn.kv_b_proj.weight
                 # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
                 # This may affect the accuracy of fp8 model.
-                if (
-                    hasattr(self.quant_config, "weight_block_size")
-                    and w.dtype == torch.float8_e4m3fn
+                if hasattr(self.quant_config, "weight_block_size") and w.dtype in (
+                    torch.float8_e4m3fn,
+                    torch.float8_e4m3fnuz,
                 ):
                     weight_block_size = self.quant_config.weight_block_size
                     if weight_block_size is not None:
                         assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                        if is_hip_:
+                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                                weight=w,
+                                weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                                input_scale=None,
+                            )
+                        else:
+                            weight = w
+                            weight_scale = self_attn.kv_b_proj.weight_scale_inv
+
                         w, scale = block_quant_to_tensor_quant(
-                            w, self_attn.kv_b_proj.weight_scale_inv, weight_block_size
+                            weight, weight_scale, weight_block_size
                         )
                         self_attn.w_scale = scale
                 w_kc, w_vc = w.unflatten(
@@ -961,6 +986,8 @@
                     and self_attn.w_scale is None
                 ):
                     self_attn.w_scale = self_attn.kv_b_proj.weight_scale
+                    if is_hip_:
+                        self_attn.w_scale *= 2.0
 
 
 class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
sglang/srt/models/grok.py CHANGED
@@ -16,13 +16,16 @@
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral.py#L1
 """Inference-only Grok1 model."""
 
-from typing import Iterable, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 
 from sglang.srt.layers.activation import GeluAndMul
@@ -42,6 +45,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.loader import DefaultModelLoader
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 
 
@@ -347,6 +351,16 @@ class Grok1ForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
+        # Monkey patch _prepare_weights to load pre-sharded weights
+        if (
+            self.config.num_local_experts > 0
+            and get_tensor_model_parallel_world_size() > 1
+        ):
+            self.use_presharded_weights = True
+            setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights)
+        else:
+            self.use_presharded_weights = False
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -359,7 +373,15 @@ class Grok1ForCausalLM(nn.Module):
             input_ids, hidden_states, self.lm_head, forward_batch
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(
+        self,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+        use_presharded_weights: bool | None = None,
+    ):
+        if use_presharded_weights is None:
+            use_presharded_weights = self.use_presharded_weights
+        num_experts = self.config.num_local_experts
+
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -375,10 +397,23 @@ class Grok1ForCausalLM(nn.Module):
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
-            num_experts=self.config.num_local_experts,
+            num_experts=num_experts,
         )
 
         params_dict = dict(self.named_parameters())
+        all_names = set(params_dict.keys())
+        hit_names = set()
+
+        def load_weight_wrapper(name, loaded_weight, *args, **kwargs):
+            if name not in params_dict:
+                return
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight, *args, **kwargs)
+
+            hit_names.add(name)
+
         for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                 continue
@@ -391,9 +426,7 @@ class Grok1ForCausalLM(nn.Module):
                 if name.endswith(".bias") and name not in params_dict:
                     continue
 
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
+                load_weight_wrapper(name, loaded_weight, shard_id)
                 break
             else:
                 for mapping in expert_params_mapping:
@@ -402,38 +435,76 @@
                         continue
                     name = name.replace(weight_name, param_name)
 
-                    if (
-                        name.endswith(".bias") or name.endswith("_bias")
-                    ) and name not in params_dict:
-                        continue
+                    if use_presharded_weights:
+                        extra_kwargs = {
+                            "use_presharded_weights": use_presharded_weights
+                        }
+                    else:
+                        extra_kwargs = {}
 
-                    param = params_dict[name]
-                    weight_loader = param.weight_loader
-                    weight_loader(
-                        param,
+                    load_weight_wrapper(
+                        name,
                         loaded_weight,
                         name,
                         shard_id=shard_id,
                         expert_id=expert_id,
+                        **extra_kwargs,
                     )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
-                    if (
-                        name.endswith(".bias") or name.endswith("_bias")
-                    ) and name not in params_dict:
-                        continue
-                    # Skip loading kv_scale from ckpts towards new design.
-                    if name.endswith(".kv_scale") and name not in params_dict:
+                    if name.endswith(".bias") and name not in params_dict:
                         continue
                     if name is None:
                         continue
 
-                    param = params_dict[name]
-                    weight_loader = getattr(
-                        param, "weight_loader", default_weight_loader
-                    )
-                    weight_loader(param, loaded_weight)
+                    load_weight_wrapper(name=name, loaded_weight=loaded_weight)
+
+
+old_prepare_weights = getattr(DefaultModelLoader, "_prepare_weights")
+
+
+def _prepare_presharded_weights(
+    self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
+) -> Tuple[str, List[str], bool]:
+    import glob
+    import os
+
+    if get_tensor_model_parallel_world_size() == 1:
+        return old_prepare_weights(self, model_name_or_path, revision, fall_back_to_pt)
+
+    if not os.path.isdir(model_name_or_path):
+        from sglang.srt.model_loader.weight_utils import download_weights_from_hf
+
+        allow_patterns = ["*.safetensors", "*.bin"]
+        hf_folder = download_weights_from_hf(
+            model_name_or_path,
+            self.load_config.download_dir,
+            allow_patterns,
+            revision,
+            ignore_patterns=self.load_config.ignore_patterns,
+        )
+    else:
+        hf_folder = model_name_or_path
+
+    tp_rank = get_tensor_model_parallel_rank()
+
+    # The old format
+    allow_patterns = [f"*-{tp_rank:03d}.bin"]
+
+    # The new format
+    allow_patterns += [f"*-TP-{tp_rank:03d}.safetensors", "*-TP-common.safetensors"]
+
+    hf_weights_files: List[str] = []
+    for pattern in allow_patterns:
+        hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+
+    if hf_weights_files[0].endswith("safetensors"):
+        use_safetensors = True
+    else:
+        use_safetensors = False
+
+    return hf_folder, hf_weights_files, use_safetensors
 
 
 class Grok1ModelForCausalLM(Grok1ForCausalLM):
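The monkey-patched loader only changes which checkpoint files each tensor-parallel rank opens; everything else about weight loading stays the same. A small sketch of the rank-specific patterns it builds, exercised against a made-up directory listing:

import fnmatch

tp_rank = 1
# Same patterns as _prepare_presharded_weights above.
allow_patterns = [
    f"*-{tp_rank:03d}.bin",                 # old pre-sharded format
    f"*-TP-{tp_rank:03d}.safetensors",      # new per-rank format
    "*-TP-common.safetensors",              # shards shared by all ranks
]

# Hypothetical 2-way tensor-parallel export.
files = ["model-TP-000.safetensors", "model-TP-001.safetensors", "model-TP-common.safetensors"]
picked = [f for f in files for p in allow_patterns if fnmatch.fnmatch(f, p)]
print(picked)  # ['model-TP-001.safetensors', 'model-TP-common.safetensors']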
sglang/srt/openai_api/adapter.py CHANGED
@@ -696,14 +696,6 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
 
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
-    if "extra_body" in request_json:
-        extra = request_json["extra_body"]
-        if "ebnf" in extra:
-            request_json["ebnf"] = extra["ebnf"]
-        if "regex" in extra:
-            request_json["regex"] = extra["regex"]
-        # remove extra_body to avoid pydantic conflict
-        del request_json["extra_body"]
     all_requests = [CompletionRequest(**request_json)]
     adapted_request, request = v1_generate_request(all_requests)
 
@@ -1176,15 +1168,6 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
 
 async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
-    if "extra_body" in request_json:
-        extra = request_json["extra_body"]
-        # For example, if 'ebnf' is given:
-        if "ebnf" in extra:
-            request_json["ebnf"] = extra["ebnf"]
-        if "regex" in extra:
-            request_json["regex"] = extra["regex"]
-        # remove extra_body to avoid pydantic conflict
-        del request_json["extra_body"]
     all_requests = [ChatCompletionRequest(**request_json)]
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
 
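The server no longer unwraps a nested extra_body object: the official openai Python client already merges extra_body keys into the top level of the request JSON, and the request models (see the protocol changes below) now declare ebnf and regex directly. A hedged client-side sketch (server URL, model name, and pattern are placeholders):

import openai

client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Answer with yes or no: is the sky blue?"}],
    # The client merges extra_body into the top-level body, so the server
    # receives a plain "regex" field on ChatCompletionRequest.
    extra_body={"regex": "(yes|no)"},
)
print(resp.choices[0].message.content)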
sglang/srt/openai_api/protocol.py CHANGED
@@ -171,15 +171,15 @@ class CompletionRequest(BaseModel):
     top_k: int = -1
     min_p: float = 0.0
     min_tokens: int = 0
-    regex: Optional[str] = None
     json_schema: Optional[str] = None
+    regex: Optional[str] = None
+    ebnf: Optional[str] = None
     repetition_penalty: float = 1.0
     stop_token_ids: Optional[List[int]] = None
     no_stop_trim: bool = False
     ignore_eos: bool = False
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
-    ebnf: Optional[str] = None
 
 
 class CompletionResponseChoice(BaseModel):
@@ -315,13 +315,13 @@ class ChatCompletionRequest(BaseModel):
     min_p: float = 0.0
     min_tokens: int = 0
     regex: Optional[str] = None
+    ebnf: Optional[str] = None
     repetition_penalty: float = 1.0
     stop_token_ids: Optional[List[int]] = None
     no_stop_trim: bool = False
     ignore_eos: bool = False
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
-    ebnf: Optional[str] = None
 
 
 class FunctionResponse(BaseModel):
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -232,3 +232,24 @@ class SamplingBatchInfo:
         self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
             self.logit_bias, other.logit_bias, len(self), len(other), self.device
         )
+
+    def apply_logits_bias(self, logits: torch.Tensor):
+        # Apply logit_bias
+        if self.logit_bias is not None:
+            logits.add_(self.logit_bias)
+
+        # min-token, presence, frequency
+        if self.linear_penalties is not None:
+            logits.add_(self.linear_penalties)
+
+        # repetition
+        if self.scaling_penalties is not None:
+            logits[:] = torch.where(
+                logits > 0,
+                logits / self.scaling_penalties,
+                logits * self.scaling_penalties,
+            )
+
+        # Apply regex vocab_mask
+        if self.vocab_mask is not None:
+            self.apply_mask(logits=logits, vocab_mask=self.vocab_mask)
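apply_logits_bias consolidates the in-place logit adjustments that previously lived on ModelRunner (note the logits[:] assignment, which keeps the mutation in place). A tiny numeric sketch of the repetition branch with made-up values, one request over a four-token vocabulary:

import torch

logits = torch.tensor([[2.0, -1.0, 0.5, -3.0]])
scaling_penalties = torch.full((1, 4), 1.2)  # repetition penalty broadcast over the vocab

# Positive logits are divided by the penalty and non-positive ones multiplied,
# so repeated tokens always become less likely.
logits[:] = torch.where(logits > 0, logits / scaling_penalties, logits * scaling_penalties)
print(logits)  # tensor([[ 1.6667, -1.2000,  0.4167, -3.6000]])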
sglang/srt/sampling/sampling_params.py CHANGED
@@ -19,6 +19,14 @@ _SAMPLING_EPS = 1e-6
 
 
 class SamplingParams:
+    """
+    The sampling parameters.
+
+    See docs/references/sampling_params.md or
+    https://sgl-project.github.io/references/sampling_params.html
+    for the documentation.
+    """
+
     def __init__(
         self,
         max_new_tokens: int = 128,
@@ -33,9 +41,9 @@
         repetition_penalty: float = 1.0,
         min_new_tokens: int = 0,
         spaces_between_special_tokens: bool = True,
-        regex: Optional[str] = None,
         n: int = 1,
         json_schema: Optional[str] = None,
+        regex: Optional[str] = None,
         ebnf: Optional[str] = None,
         no_stop_trim: bool = False,
         ignore_eos: bool = False,
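SamplingParams now carries a docstring pointing at the published reference, and regex/ebnf sit next to json_schema in the constructor. A hedged sketch of passing these fields through the native /generate HTTP API, whose sampling_params keys mirror the constructor arguments above (URL and prompt are placeholders):

import requests

payload = {
    "text": "The capital of France is",
    "sampling_params": {
        "max_new_tokens": 8,
        "temperature": 0.0,
        # Use exactly one constrained-decoding field per request: regex, ebnf, or json_schema.
        "regex": "(Paris|Lyon|Marseille)",
    },
}
resp = requests.post("http://127.0.0.1:30000/generate", json=payload)
print(resp.json()["text"])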
sglang/srt/server.py CHANGED
@@ -27,7 +27,9 @@ import signal
 import threading
 import time
 from http import HTTPStatus
-from typing import AsyncIterator, Dict, List, Optional, Union
+from typing import AsyncIterator, Dict, List, Optional, Tuple, Union
+
+import torch
 
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -78,6 +80,7 @@ from sglang.srt.openai_api.adapter import (
 from sglang.srt.openai_api.protocol import ModelCard, ModelList
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
+    MultiprocessingSerializer,
     add_api_key_middleware,
     add_prometheus_middleware,
     assert_pkg_version,
@@ -872,9 +875,11 @@ class Engine:
             tokenizer_manager.update_weights_from_distributed(obj, None)
         )
 
-    def update_weights_from_tensor(self, name, tensor):
+    def update_weights_from_tensor(self, named_tensors: List[Tuple[str, torch.Tensor]]):
         """Update weights from distributed source."""
-        obj = UpdateWeightsFromTensorReqInput(name=name, tensor=tensor)
+        obj = UpdateWeightsFromTensorReqInput(
+            serialized_named_tensors=MultiprocessingSerializer.serialize(named_tensors)
+        )
         loop = asyncio.get_event_loop()
         return loop.run_until_complete(
             tokenizer_manager.update_weights_from_tensor(obj, None)
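The Engine front end mirrors the ModelRunner change, serializing the named tensors with MultiprocessingSerializer before they cross into the scheduler process. A hedged usage sketch (model path, parameter name, and shape are placeholders; assumes the offline Engine entry point exposed as sglang.Engine):

import torch
import sglang as sgl

engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
new_weight = torch.zeros(4096, 4096)  # placeholder; must match the parameter's real shape/dtype
engine.update_weights_from_tensor([("model.layers.0.self_attn.o_proj.weight", new_weight)])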
@@ -910,10 +915,9 @@ class Runtime:
         atexit.register(self.shutdown)
 
         # Pre-allocate ports
-        for port in range(10000, 40000):
+        for port in range(self.server_args.port, 40000):
             if is_port_available(port):
                 break
-            port += 1
         self.server_args.port = port
 
         self.url = self.server_args.url()