sglang 0.5.1.post3__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +60 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/hf_transformers_utils.py +10 -0
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +240 -109
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +12 -6
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +9 -4
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/managers/cache_controller.py +62 -96
- sglang/srt/managers/detokenizer_manager.py +43 -2
- sglang/srt/managers/io_struct.py +27 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +36 -2
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +86 -39
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +20 -3
- sglang/srt/mem_cache/hiradix_cache.py +75 -68
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +4 -0
- sglang/srt/mem_cache/memory_pool_host.py +2 -4
- sglang/srt/mem_cache/radix_cache.py +5 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +33 -7
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +2 -1
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +5 -4
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +26 -10
- sglang/srt/models/gpt_oss.py +0 -14
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +65 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +112 -55
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/utils.py +14 -0
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +5 -5
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +83 -78
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,6 @@ from functools import wraps
 import psutil
 import torch
 
-from sglang.srt.distributed import get_tensor_model_parallel_rank
 from sglang.srt.mem_cache.memory_pool import KVCache, MHATokenToKVPool, MLATokenToKVPool
 from sglang.srt.utils import is_npu
 
@@ -464,8 +463,7 @@ class MHATokenToKVPoolHost(HostKVCache):
         else:
             raise ValueError(f"Unsupported layout: {self.layout}")
 
-    def get_buffer_meta(self, keys, indices):
-        local_rank = get_tensor_model_parallel_rank()
+    def get_buffer_meta(self, keys, indices, local_rank):
         ptr_list = []
         key_list = []
         kv_buffer_data_ptr = self.kv_buffer.data_ptr()
@@ -704,7 +702,7 @@ class MLATokenToKVPoolHost(HostKVCache):
         else:
             raise ValueError(f"Unsupported layout: {self.layout}")
 
-    def get_buffer_meta(self, keys, indices):
+    def get_buffer_meta(self, keys, indices, local_rank):
         ptr_list = []
         key_list = []
         kv_buffer_data_ptr = self.kv_buffer.data_ptr()
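Note: the two `get_buffer_meta` hunks above belong to `sglang/srt/mem_cache/memory_pool_host.py` (+2 -4 in the file list). The host KV pools no longer look up the tensor-parallel rank themselves; callers pass `local_rank` in. A minimal, self-contained sketch of that dependency-injection pattern; the class and variable names below are illustrative, not the real sglang types:

```python
# Sketch only: mirrors the new calling convention, not the real implementation.
class HostKVPoolSketch:
    def get_buffer_meta(self, keys, indices, local_rank):
        # Rank-dependent buffer metadata is derived from the injected rank
        # instead of an internal get_tensor_model_parallel_rank() call.
        return [(key, idx, local_rank) for key, idx in zip(keys, indices)]

pool = HostKVPoolSketch()
print(pool.get_buffer_meta(["page-0", "page-1"], [0, 1], local_rank=0))
```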
sglang/srt/mem_cache/radix_cache.py
CHANGED
@@ -62,7 +62,6 @@ class TreeNode:
         self.host_value: Optional[torch.Tensor] = None
         # store hash values of each pages
         self.hash_value: Optional[List[str]] = None
-        self.backuped_storage = False
 
         self.id = TreeNode.counter if id is None else id
         TreeNode.counter += 1
@@ -195,7 +194,7 @@ class RadixCache(BasePrefixCache):
             last_host_node=last_node,
         )
 
-    def insert(self, key: List, value=None):
+    def insert(self, key: List, value=None, chunked=False):
         if self.disable:
             return 0
 
@@ -240,7 +239,7 @@ class RadixCache(BasePrefixCache):
         self.req_to_token_pool.free(req.req_pool_idx)
         self.dec_lock_ref(req.last_node)
 
-    def cache_unfinished_req(self, req: Req):
+    def cache_unfinished_req(self, req: Req, chunked=False):
         """Cache request when it is unfinished."""
         if self.disable:
             return
@@ -261,7 +260,9 @@ class RadixCache(BasePrefixCache):
         page_aligned_token_ids = token_ids[:page_aligned_len]
 
         # Radix Cache takes one ref in memory pool
-        new_prefix_len = self.insert(page_aligned_token_ids, page_aligned_kv_indices)
+        new_prefix_len = self.insert(
+            page_aligned_token_ids, page_aligned_kv_indices, chunked=chunked
+        )
         self.token_to_kv_pool_allocator.free(
             kv_indices[len(req.prefix_indices) : new_prefix_len]
         )
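The `chunked=False` keyword added to `insert` and `cache_unfinished_req` above (and to the C++/SWA variants below) keeps existing call sites source-compatible while letting callers, presumably the scheduler's chunked-prefill path, mark a request as chunked. A self-contained sketch of how such a default-off flag threads through, with hypothetical names:

```python
from typing import List, Optional

class RadixCacheSketch:
    """Hypothetical stand-in; only shows how the new flag is forwarded."""

    def insert(self, key: List[int], value: Optional[List[int]] = None, chunked: bool = False) -> int:
        # A real implementation walks the radix tree; here we just surface the flag.
        print(f"insert(len={len(key)}, chunked={chunked})")
        return 0

    def cache_unfinished_req(self, token_ids: List[int], chunked: bool = False) -> None:
        # Forward the flag to insert(), mirroring the diff above.
        self.insert(token_ids, token_ids, chunked=chunked)

cache = RadixCacheSketch()
cache.cache_unfinished_req([1, 2, 3])                # old call sites keep working
cache.cache_unfinished_req([1, 2, 3], chunked=True)  # new chunked path
```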
sglang/srt/mem_cache/radix_cache_cpp.py
CHANGED
@@ -181,7 +181,7 @@ class RadixCacheCpp(BasePrefixCache):
         self.dec_lock_ref(req.last_node)
         self.req_to_token_pool.free(req.req_pool_idx)
 
-    def cache_unfinished_req(self, req: Req):
+    def cache_unfinished_req(self, req: Req, chunked=False):
         """Cache request when it is unfinished."""
         assert req.req_pool_idx is not None
         token_ids = req.fill_ids
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py
CHANGED
@@ -125,6 +125,7 @@ class HiCacheHF3FS(HiCacheStorage):
         entries: int,
         dtype: torch.dtype,
         metadata_client: Hf3fsMetadataInterface,
+        is_mla_model: bool = False,
     ):
         self.rank = rank
         self.file_path = file_path
@@ -134,9 +135,13 @@
         self.entries = entries
         self.dtype = dtype
         self.metadata_client = metadata_client
-
+        self.is_mla_model = is_mla_model
         self.numel = self.bytes_per_page // self.dtype.itemsize
         self.num_pages = self.file_size // self.bytes_per_page
+        self.skip_backup = False
+        if self.is_mla_model and self.rank != 0:
+            self.skip_backup = True
+            self.rank = 0
 
         logger.info(
             f"[Rank {self.rank}] HiCacheHF3FS Client Initializing: "
@@ -209,10 +214,14 @@
             raise ValueError(f"Missing required keys in config: {missing_keys}")
 
         # Choose metadata client based on configuration
+        is_mla_model = False
         if "metadata_server_url" in config and config["metadata_server_url"]:
             # Use global metadata client to connect to metadata server
             metadata_server_url = config["metadata_server_url"]
             metadata_client = Hf3fsGlobalMetadataClient(metadata_server_url)
+
+            # Enable MLA optimization only when using the global metadata client
+            is_mla_model = storage_config.is_mla_model if storage_config else False
             logger.info(
                 f"Using global metadata client with server url: {metadata_server_url}"
             )
@@ -222,13 +231,15 @@
 
         return HiCacheHF3FS(
             rank=rank,
-            file_path=f"{config['file_path_prefix']}.{rank}.bin",
+            # Let all ranks use the same file path for MLA model
+            file_path=f"{config['file_path_prefix']}.{rank if not is_mla_model else 0}.bin",
             file_size=int(config["file_size"]),
             numjobs=int(config["numjobs"]),
             bytes_per_page=bytes_per_page,
             entries=int(config["entries"]),
             dtype=dtype,
             metadata_client=metadata_client,
+            is_mla_model=is_mla_model,
         )
 
     def get(
@@ -312,6 +323,10 @@
         target_locations: Optional[Any] = None,
         target_sizes: Optional[Any] = None,
     ) -> bool:
+        # In MLA backend, only one rank needs to backup the KV cache
+        if self.skip_backup:
+            return True
+
         # Todo: Add prefix block's hash key
         key_with_prefix = [(key, "") for key in keys]
         indices = self.metadata_client.reserve_and_allocate_page_indices(
|
|
363
378
|
|
364
379
|
return all(results)
|
365
380
|
|
366
|
-
@synchronized()
|
367
381
|
def delete(self, key: str) -> None:
|
368
382
|
self.metadata_client.delete_keys(self.rank, [key])
|
369
383
|
|
370
|
-
@synchronized()
|
371
384
|
def exists(self, key: str) -> bool:
|
372
385
|
result = self.metadata_client.exists(self.rank, [key])
|
373
386
|
return result[0] if result else False
|
374
387
|
|
375
|
-
|
376
|
-
|
377
|
-
|
388
|
+
def batch_exists(self, keys: List[str]) -> int:
|
389
|
+
results = self.metadata_client.exists(self.rank, keys)
|
390
|
+
for i in range(len(keys)):
|
391
|
+
if not results[i]:
|
392
|
+
return i
|
393
|
+
|
394
|
+
return len(keys)
|
395
|
+
|
396
|
+
def clear(self) -> bool:
|
397
|
+
try:
|
398
|
+
self.metadata_client.clear(self.rank)
|
399
|
+
logger.info(f"Cleared HiCacheHF3FS for rank {self.rank}")
|
400
|
+
return True
|
401
|
+
except Exception as e:
|
402
|
+
logger.error(f"Failed to clear HiCacheHF3FS: {e}")
|
403
|
+
return False
|
378
404
|
|
379
405
|
def close(self) -> None:
|
380
406
|
try:
|
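Taken together, the `storage_hf3fs.py` hunks above make MLA models back up the KV cache from a single rank: non-zero ranks set `skip_backup` (so `batch_set` returns immediately) and all ranks share the rank-0 file path. The new `batch_exists` reports how many keys from the front of the list are already stored. A small self-contained sketch of that prefix-count contract, using a plain dict in place of the metadata client:

```python
from typing import Dict, List

def batch_exists_sketch(stored: Dict[str, bool], keys: List[str]) -> int:
    """Return the length of the leading run of keys that already exist."""
    for i, key in enumerate(keys):
        if not stored.get(key, False):
            return i
    return len(keys)

stored = {"k0": True, "k1": True, "k2": False}
print(batch_exists_sketch(stored, ["k0", "k1", "k2", "k3"]))  # -> 2: only the leading ["k0", "k1"] exist
```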
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py
CHANGED
@@ -159,6 +159,7 @@ class MooncakeStore(HiCacheStorage):
     def batch_set(
         self,
         keys: List[str],
+        values: Optional[List[torch.Tensor]] = None,
         target_location: Optional[List[int]] = None,
         target_sizes: Optional[List[int]] = None,
     ) -> bool:
@@ -253,7 +254,7 @@
         pass
 
     def clear(self) -> None:
-
+        self.store.remove_all()
 
     def _put_batch_zero_copy_impl(
         self, key_strs: List[str], buffer_ptrs: List[int], buffer_sizes: List[int]
sglang/srt/mem_cache/swa_radix_cache.py
CHANGED
@@ -464,7 +464,7 @@ class SWARadixCache(BasePrefixCache):
         self.req_to_token_pool.free(req.req_pool_idx)
         self.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
 
-    def cache_unfinished_req(self, req: Req) -> None:
+    def cache_unfinished_req(self, req: Req, chunked=False) -> None:
         """Cache request when it is unfinished."""
         if self.disable:
             kv_indices = self.req_to_token_pool.req_to_token[
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -307,7 +307,10 @@ class ModelRunner:
         model_num_layers = (
             self.model_config.num_nextn_predict_layers
             if self.is_draft_worker and model_has_mtp_layers
-            else self.model_config.num_hidden_layers
+            else max(
+                self.model_config.num_hidden_layers,
+                self.model_config.num_attention_layers,
+            )
         )
         self.start_layer = getattr(self.model, "start_layer", 0)
         self.end_layer = getattr(self.model, "end_layer", model_num_layers)
@@ -1440,14 +1443,12 @@
             else self.server_args.attention_backend
         )
         if self.decode_attention_backend_str != self.prefill_attention_backend_str:
-            assert (
-                self.server_args.speculative_algorithm is None
-            ), "Currently HybridAttentionBackend does not support speculative decoding."
             from sglang.srt.layers.attention.hybrid_attn_backend import (
                 HybridAttnBackend,
             )
 
             attn_backend = HybridAttnBackend(
+                self,
                 decode_backend=self._get_attention_backend_from_str(
                     self.decode_attention_backend_str
                 ),
sglang/srt/model_loader/loader.py
CHANGED
@@ -42,6 +42,7 @@ from sglang.srt.distributed import (
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_loader.utils import (
     get_model_architecture,
+    post_load_weights,
     set_default_torch_dtype,
 )
 from sglang.srt.model_loader.weight_utils import (
@@ -600,18 +601,7 @@ class DummyModelLoader(BaseModelLoader):
             # random values to the weights.
             initialize_dummy_weights(model)
 
-
-            # 1. Initial weight loading.
-            # 2. Post-processing of weights, including assigning specific member variables.
-            # For `dummy_init`, only the second stage is required.
-            if hasattr(model, "post_load_weights"):
-                if (
-                    model_config.hf_config.architectures[0]
-                    == "DeepseekV3ForCausalLMNextN"
-                ):
-                    model.post_load_weights(is_nextn=True)
-                else:
-                    model.post_load_weights()
+            post_load_weights(model, model_config)
 
         return model.eval()
 
@@ -751,6 +741,9 @@ class ShardedStateLoader(BaseModelLoader):
                     state_dict.pop(key)
             if state_dict:
                 raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
+
+            post_load_weights(model, model_config)
+
             return model.eval()
 
     @staticmethod
@@ -1421,18 +1414,16 @@ class RemoteModelLoader(BaseModelLoader):
                 # ignore hidden files
                 if file_name.startswith("."):
                     continue
-                if os.path.splitext(file_name)[1] not in (
-                    ".bin",
-                    ".pt",
-                    ".safetensors",
-                ):
+                if os.path.splitext(file_name)[1] in (".json", ".py"):
                     file_path = os.path.join(root, file_name)
                     with open(file_path, encoding="utf-8") as file:
                         file_content = file.read()
                     f_key = f"{model_name}/files/{file_name}"
                     client.setstr(f_key, file_content)
 
-    def _load_model_from_remote_kv(self, model: nn.Module, client):
+    def _load_model_from_remote_kv(
+        self, model: nn.Module, model_config: ModelConfig, client
+    ):
         for _, module in model.named_modules():
             quant_method = getattr(module, "quant_method", None)
             if quant_method is not None:
|
|
1460
1451
|
if state_dict:
|
1461
1452
|
raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
|
1462
1453
|
|
1454
|
+
post_load_weights(model, model_config)
|
1455
|
+
|
1463
1456
|
def _load_model_from_remote_fs(
|
1464
1457
|
self, model, client, model_config: ModelConfig, device_config: DeviceConfig
|
1465
1458
|
) -> nn.Module:
|
@@ -1501,15 +1494,13 @@ class RemoteModelLoader(BaseModelLoader):
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
                 model = _initialize_model(model_config, self.load_config)
-                for _, module in model.named_modules():
-                    quant_method = getattr(module, "quant_method", None)
-                    if quant_method is not None:
-                        quant_method.process_weights_after_loading(module)
 
-            with create_remote_connector(
+            with create_remote_connector(
+                model_weights, device=device_config.device
+            ) as client:
                 connector_type = get_connector_type(client)
                 if connector_type == ConnectorType.KV:
-                    self._load_model_from_remote_kv(model, client)
+                    self._load_model_from_remote_kv(model, model_config, client)
                 elif connector_type == ConnectorType.FS:
                     self._load_model_from_remote_fs(
                         model, client, model_config, device_config
sglang/srt/model_loader/utils.py
CHANGED
@@ -105,3 +105,15 @@ def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module],
 
 def get_architecture_class_name(model_config: ModelConfig) -> str:
     return get_model_architecture(model_config)[1]
+
+
+def post_load_weights(model: nn.Module, model_config: ModelConfig):
+    # Model weight loading consists of two stages:
+    # 1. Initial weight loading.
+    # 2. Post-processing of weights, including assigning specific member variables.
+    # For `dummy_init`, only the second stage is required.
+    if hasattr(model, "post_load_weights"):
+        if model_config.hf_config.architectures[0] == "DeepseekV3ForCausalLMNextN":
+            model.post_load_weights(is_nextn=True)
+        else:
+            model.post_load_weights()
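The new `post_load_weights` helper above centralizes the second stage of weight loading that `DummyModelLoader` previously inlined and that `ShardedStateLoader` / `RemoteModelLoader` now also call (see the loader.py hunks earlier). A minimal sketch of its dispatch behavior, using stand-in classes rather than the real sglang types:

```python
class FakeHFConfig:
    architectures = ["DeepseekV3ForCausalLMNextN"]

class FakeModelConfig:
    hf_config = FakeHFConfig()

class FakeModel:
    def post_load_weights(self, is_nextn: bool = False):
        print(f"post_load_weights(is_nextn={is_nextn})")

def post_load_weights_sketch(model, model_config):
    # Mirrors the helper: call the hook only if the model defines it, and
    # pass is_nextn=True for the DeepSeek-V3 NextN draft architecture.
    if hasattr(model, "post_load_weights"):
        if model_config.hf_config.architectures[0] == "DeepseekV3ForCausalLMNextN":
            model.post_load_weights(is_nextn=True)
        else:
            model.post_load_weights()

post_load_weights_sketch(FakeModel(), FakeModelConfig())  # prints is_nextn=True
```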
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -114,6 +114,7 @@ from sglang.srt.utils import (
     is_flashinfer_available,
     is_hip,
     is_non_idle_and_non_empty,
+    is_npu,
     is_sm100_supported,
     log_info_on_rank0,
     make_layers,
@@ -122,6 +123,7 @@ from sglang.srt.utils import (
 
 _is_hip = is_hip()
 _is_cuda = is_cuda()
+_is_npu = is_npu()
 _is_fp8_fnuz = is_fp8_fnuz()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_cpu_amx_available = cpu_has_amx_support()
@@ -1181,13 +1183,19 @@ class DeepseekV2AttentionMLA(nn.Module):
         k[..., : self.qk_nope_head_dim] = k_nope
         k[..., self.qk_nope_head_dim :] = k_pe
 
-        latent_cache[:, :, : self.kv_lora_rank] = kv_a.unsqueeze(1)
-        latent_cache[:, :, self.kv_lora_rank :] = k_pe
+        if not _is_npu:
+            latent_cache[:, :, : self.kv_lora_rank] = kv_a.unsqueeze(1)
+            latent_cache[:, :, self.kv_lora_rank :] = k_pe
 
-        # Save latent cache
-        forward_batch.token_to_kv_pool.set_kv_buffer(
-            self.attn_mha, forward_batch.out_cache_loc, latent_cache, None
-        )
+            # Save latent cache
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                self.attn_mha, forward_batch.out_cache_loc, latent_cache, None
+            )
+        else:
+            # To reduce a time-costing split operation
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                self.attn_mha, forward_batch.out_cache_loc, kv_a.unsqueeze(1), k_pe
+            )
 
         return q, k, v, forward_batch
 
@@ -2406,18 +2414,26 @@ class DeepseekV2ForCausalLM(nn.Module):
         )
 
         num_hidden_layers = 1 if is_nextn else self.config.num_hidden_layers
+
         for layer_id in range(num_hidden_layers):
             if is_nextn:
                 layer = self.model.decoder
             else:
                 layer = self.model.layers[layer_id]
 
-            for module in [
-                layer.self_attn.fused_qkv_a_proj_with_mqa,
-                layer.self_attn.q_b_proj,
+            module_list = [
                 layer.self_attn.kv_b_proj,
                 layer.self_attn.o_proj,
-            ]:
+            ]
+
+            if self.config.q_lora_rank is not None:
+                module_list.append(layer.self_attn.fused_qkv_a_proj_with_mqa)
+                module_list.append(layer.self_attn.q_b_proj)
+            else:
+                module_list.append(layer.self_attn.kv_a_proj_with_mqa)
+                module_list.append(layer.self_attn.q_proj)
+
+            for module in module_list:
                 requant_weight_ue8m0_inplace(
                     module.weight, module.weight_scale_inv, weight_block_size
                 )
sglang/srt/models/gpt_oss.py
CHANGED
@@ -1029,10 +1029,6 @@ class GptOssForCausalLM(nn.Module):
         )
 
         params_dict = dict(self.named_parameters())
-        params_checker = {k: False for k, v in params_dict.items()}
-
-        for other_loaded_param_name in other_loaded_param_names:
-            params_checker[other_loaded_param_name] = True
 
         for name, loaded_weight in weights:
             loaded_weight = _WeightCreator.maybe_materialize(loaded_weight)
@@ -1069,7 +1065,6 @@ class GptOssForCausalLM(nn.Module):
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
-                params_checker[name] = True
                 break
             else:
                 for mapping in expert_params_mapping:
@@ -1092,7 +1087,6 @@ class GptOssForCausalLM(nn.Module):
                         name,
                         shard_id=shard_id,
                     )
-                    params_checker[name] = True
                     break
                 else:
                     if name.endswith(".bias") and name not in params_dict:
@@ -1111,17 +1105,9 @@ class GptOssForCausalLM(nn.Module):
                         param, "weight_loader", default_weight_loader
                     )
                     weight_loader(param, loaded_weight)
-                    params_checker[name] = True
                 else:
                     logger.warning(f"Parameter {name} not found in params_dict")
 
-        not_loaded_params = [k for k, v in params_checker.items() if not v]
-        if tp_rank == 0:
-            if len(not_loaded_params) > 0:
-                raise Exception(f"Not all parameters loaded: {not_loaded_params}")
-            else:
-                logging.info("All parameters loaded successfully.")
-
     def get_embed_and_head(self):
         return self.model.embed_tokens.weight, self.lm_head.weight
 
sglang/srt/models/llama_eagle3.py
CHANGED
@@ -185,9 +185,13 @@ class LlamaForCausalLMEagle3(LlamaForCausalLM):
         )
         # Llama 3.2 1B Instruct set tie_word_embeddings to True
         # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        self.load_lm_head_from_target = False
         if self.config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:
+            if config.draft_vocab_size is None:
+                self.load_lm_head_from_target = True
+                config.draft_vocab_size = config.vocab_size
             self.lm_head = ParallelLMHead(
                 config.draft_vocab_size,
                 config.hidden_size,