sglang 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. sglang/bench_latency.py +31 -13
  2. sglang/bench_server_latency.py +21 -10
  3. sglang/bench_serving.py +101 -7
  4. sglang/global_config.py +0 -1
  5. sglang/srt/conversation.py +11 -2
  6. sglang/srt/layers/attention/__init__.py +27 -5
  7. sglang/srt/layers/attention/double_sparsity_backend.py +281 -0
  8. sglang/srt/layers/attention/flashinfer_backend.py +352 -83
  9. sglang/srt/layers/attention/triton_backend.py +6 -4
  10. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
  11. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
  12. sglang/srt/layers/attention/triton_ops/prefill_attention.py +4 -2
  13. sglang/srt/layers/sampler.py +6 -2
  14. sglang/srt/managers/data_parallel_controller.py +177 -0
  15. sglang/srt/managers/detokenizer_manager.py +31 -10
  16. sglang/srt/managers/io_struct.py +11 -2
  17. sglang/srt/managers/schedule_batch.py +126 -43
  18. sglang/srt/managers/schedule_policy.py +2 -1
  19. sglang/srt/managers/scheduler.py +245 -142
  20. sglang/srt/managers/tokenizer_manager.py +14 -1
  21. sglang/srt/managers/tp_worker.py +111 -1
  22. sglang/srt/mem_cache/chunk_cache.py +8 -4
  23. sglang/srt/mem_cache/memory_pool.py +77 -4
  24. sglang/srt/mem_cache/radix_cache.py +15 -7
  25. sglang/srt/model_executor/cuda_graph_runner.py +4 -4
  26. sglang/srt/model_executor/forward_batch_info.py +16 -21
  27. sglang/srt/model_executor/model_runner.py +100 -36
  28. sglang/srt/models/baichuan.py +2 -3
  29. sglang/srt/models/chatglm.py +5 -6
  30. sglang/srt/models/commandr.py +1 -2
  31. sglang/srt/models/dbrx.py +1 -2
  32. sglang/srt/models/deepseek.py +4 -5
  33. sglang/srt/models/deepseek_v2.py +5 -6
  34. sglang/srt/models/exaone.py +1 -2
  35. sglang/srt/models/gemma.py +2 -2
  36. sglang/srt/models/gemma2.py +5 -5
  37. sglang/srt/models/gpt_bigcode.py +5 -5
  38. sglang/srt/models/grok.py +1 -2
  39. sglang/srt/models/internlm2.py +1 -2
  40. sglang/srt/models/llama.py +1 -2
  41. sglang/srt/models/llama_classification.py +1 -2
  42. sglang/srt/models/llama_reward.py +2 -3
  43. sglang/srt/models/llava.py +4 -8
  44. sglang/srt/models/llavavid.py +1 -2
  45. sglang/srt/models/minicpm.py +1 -2
  46. sglang/srt/models/minicpm3.py +5 -6
  47. sglang/srt/models/mixtral.py +1 -2
  48. sglang/srt/models/mixtral_quant.py +1 -2
  49. sglang/srt/models/olmo.py +352 -0
  50. sglang/srt/models/olmoe.py +1 -2
  51. sglang/srt/models/qwen.py +1 -2
  52. sglang/srt/models/qwen2.py +1 -2
  53. sglang/srt/models/qwen2_moe.py +4 -5
  54. sglang/srt/models/stablelm.py +1 -2
  55. sglang/srt/models/torch_native_llama.py +1 -2
  56. sglang/srt/models/xverse.py +1 -2
  57. sglang/srt/models/xverse_moe.py +4 -5
  58. sglang/srt/models/yivl.py +1 -2
  59. sglang/srt/openai_api/adapter.py +97 -52
  60. sglang/srt/openai_api/protocol.py +10 -2
  61. sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
  62. sglang/srt/sampling/sampling_batch_info.py +105 -59
  63. sglang/srt/sampling/sampling_params.py +2 -0
  64. sglang/srt/server.py +171 -37
  65. sglang/srt/server_args.py +127 -48
  66. sglang/srt/utils.py +37 -14
  67. sglang/test/few_shot_gsm8k.py +4 -1
  68. sglang/test/few_shot_gsm8k_engine.py +144 -0
  69. sglang/test/srt/sampling/penaltylib/utils.py +16 -12
  70. sglang/version.py +1 -1
  71. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/METADATA +82 -32
  72. sglang-0.3.4.dist-info/RECORD +143 -0
  73. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/WHEEL +1 -1
  74. sglang/srt/layers/attention/flashinfer_utils.py +0 -237
  75. sglang-0.3.3.dist-info/RECORD +0 -139
  76. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/LICENSE +0 -0
  77. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py CHANGED
@@ -18,6 +18,7 @@ limitations under the License.
 import gc
 import importlib
 import importlib.resources
+import json
 import logging
 import pkgutil
 from functools import lru_cache
@@ -39,6 +40,7 @@ from vllm.model_executor.models import ModelRegistry
 
 from sglang.srt.configs.model_config import AttentionArch, ModelConfig
 from sglang.srt.constrained import disable_cache
+from sglang.srt.layers.attention.double_sparsity_backend import DoubleSparseAttnBackend
 from sglang.srt.layers.attention.flashinfer_backend import FlashInferAttnBackend
 from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -46,6 +48,7 @@ from sglang.srt.layers.sampler import Sampler
 from sglang.srt.lora.lora_manager import LoRAManager
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.mem_cache.memory_pool import (
+    DoubleSparseTokenToKVPool,
     MHATokenToKVPool,
     MLATokenToKVPool,
     ReqToTokenPool,
@@ -81,10 +84,11 @@ class ModelRunner:
         # Parse args
         self.model_config = model_config
         self.mem_fraction_static = mem_fraction_static
+        self.device = server_args.device
         self.gpu_id = gpu_id
         self.tp_rank = tp_rank
         self.tp_size = tp_size
-        self.nccl_port = nccl_port
+        self.dist_port = nccl_port
         self.server_args = server_args
         self.is_multimodal_model = is_multimodal_model(
             self.model_config.hf_config.architectures
@@ -95,9 +99,23 @@ class ModelRunner:
             self.model_config.attention_arch == AttentionArch.MLA
             and not self.server_args.disable_mla
         ):
-            logger.info("MLA optimization is tunred on. Use triton backend.")
+            logger.info("MLA optimization is turned on. Use triton backend.")
             self.server_args.attention_backend = "triton"
 
+        if self.server_args.enable_double_sparsity:
+            logger.info(
+                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+            )
+            self.server_args.attention_backend = "triton"
+            self.server_args.disable_cuda_graph = True
+            if self.server_args.ds_heavy_channel_type is None:
+                raise ValueError(
+                    "Please specify the heavy channel type for double sparsity optimization."
+                )
+            self.init_double_sparsity_channel_config(
+                self.server_args.ds_heavy_channel_type
+            )
+
         if self.is_multimodal_model:
             logger.info(
                 "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
@@ -118,6 +136,8 @@ class ModelRunner:
                 "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
                 "disable_mla": server_args.disable_mla,
                 "torchao_config": server_args.torchao_config,
+                "disable_penalizer": server_args.disable_penalizer,
+                "disable_nan_detection": server_args.disable_nan_detection,
             }
         )
 
@@ -132,39 +152,51 @@ class ModelRunner:
             server_args.max_running_requests,
             server_args.max_total_tokens,
         )
-        self.init_cublas()
-        self.init_attention_backend()
-        self.init_cuda_graphs()
+        if self.device == "cuda":
+            self.init_cublas()
+            self.init_attention_backend()
+            self.init_cuda_graphs()
+        else:
+            self.cuda_graph_runner = None
+            self.init_attention_backend()
 
     def init_torch_distributed(self):
+        logger.info("Init torch distributed begin.")
         # Init torch distributed
-        torch.cuda.set_device(self.gpu_id)
-        logger.info("Init nccl begin.")
+        if self.device == "cuda":
+            torch.cuda.set_device(self.gpu_id)
+            backend = "nccl"
+        # ToDO(liangan1):Just use gloo to bypass the initilization fail
+        # Need to use xccl for xpu backend in the future
+        elif self.device == "xpu":
+            torch.xpu.set_device(self.gpu_id)
+            backend = "gloo"
 
         if not self.server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)
-
         if self.server_args.dist_init_addr:
-            nccl_init_method = f"tcp://{self.server_args.dist_init_addr}"
+            dist_init_method = f"tcp://{self.server_args.dist_init_addr}"
         else:
-            nccl_init_method = f"tcp://127.0.0.1:{self.nccl_port}"
+            dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
         set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
         init_distributed_environment(
-            backend="nccl",
+            backend=backend,
             world_size=self.tp_size,
             rank=self.tp_rank,
             local_rank=self.gpu_id,
-            distributed_init_method=nccl_init_method,
+            distributed_init_method=dist_init_method,
         )
         initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
         min_per_gpu_memory = get_available_gpu_memory(
-            self.gpu_id, distributed=self.tp_size > 1
+            self.device, self.gpu_id, distributed=self.tp_size > 1
         )
         self.tp_group = get_tp_group()
 
         # Currently, there is a bug with mulit-node tensor parallelsim + padded cuda graph,
         # so we disable padding in cuda graph.
-        if not all(in_the_same_node_as(self.tp_group.cpu_group, source_rank=0)):
+        if self.device == "cuda" and not all(
+            in_the_same_node_as(self.tp_group.cpu_group, source_rank=0)
+        ):
             self.server_args.disable_cuda_graph_padding = True
             logger.info(
                 "Setting disable_cuda_graph_padding to True because of multi-node tensor parallelism."
@@ -172,7 +204,7 @@ class ModelRunner:
 
         # Check memory for tensor parallelism
         if self.tp_size > 1:
-            local_gpu_memory = get_available_gpu_memory(self.gpu_id)
+            local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
             if min_per_gpu_memory < local_gpu_memory * 0.9:
                 raise ValueError(
                     "The memory capacity is unbalanced. Some GPUs may be occupied by other processes."
@@ -182,23 +214,22 @@ class ModelRunner:
 
     def load_model(self):
         logger.info(
-            f"Load weight begin. avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
+            f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
         # This can reduce thread conflicts and speed up weight loading.
        torch.set_num_threads(1)
-
-        if torch.cuda.get_device_capability()[0] < 8:
-            logger.info(
-                "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
-            )
-            self.server_args.dtype = "float16"
-            if torch.cuda.get_device_capability()[1] < 5:
-                raise RuntimeError("SGLang only supports sm75 and above.")
+        if self.device == "cuda":
+            if torch.cuda.get_device_capability()[0] < 8:
+                logger.info(
+                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                )
+                self.server_args.dtype = "float16"
+                if torch.cuda.get_device_capability()[1] < 5:
+                    raise RuntimeError("SGLang only supports sm75 and above.")
 
         # Prepare the vllm model config
         monkey_patch_vllm_dummy_weight_loader()
-        self.device_config = DeviceConfig()
         self.load_config = LoadConfig(load_format=self.server_args.load_format)
         self.vllm_model_config = VllmModelConfig(
             model=self.server_args.model_path,
@@ -220,7 +251,7 @@ class ModelRunner:
         self.model = get_model(
             model_config=self.vllm_model_config,
             load_config=self.load_config,
-            device_config=self.device_config,
+            device_config=DeviceConfig(self.device),
             parallel_config=None,
             scheduler_config=None,
             lora_config=None,
@@ -240,7 +271,7 @@ class ModelRunner:
             f"Load weight end. "
             f"type={type(self.model).__name__}, "
             f"dtype={self.dtype}, "
-            f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
     def update_weights(self, model_path: str, load_format: str):
@@ -254,10 +285,10 @@ class ModelRunner:
 
         logger.info(
             f"Update weights begin. "
-            f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
-        target_device = torch.device(self.device_config.device)
+        target_device = torch.device(self.device)
 
         try:
             # TODO: Use a better method to check this
@@ -343,7 +374,7 @@ class ModelRunner:
 
     def profile_max_num_token(self, total_gpu_memory: int):
         available_gpu_memory = get_available_gpu_memory(
-            self.gpu_id, distributed=self.tp_size > 1
+            self.device, self.gpu_id, distributed=self.tp_size > 1
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
@@ -409,11 +440,10 @@ class ModelRunner:
             4096,
         )
 
-        device = "cuda"
         self.req_to_token_pool = ReqToTokenPool(
             size=max_num_reqs + 1,
             max_context_len=self.model_config.context_len + 4,
-            device=device,
+            device=self.device,
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
@@ -425,7 +455,17 @@ class ModelRunner:
                 kv_lora_rank=self.model_config.kv_lora_rank,
                 qk_rope_head_dim=self.model_config.qk_rope_head_dim,
                 layer_num=self.model_config.num_hidden_layers,
-                device=device,
+                device=self.device,
+            )
+        elif self.server_args.enable_double_sparsity:
+            self.token_to_kv_pool = DoubleSparseTokenToKVPool(
+                self.max_total_num_tokens,
+                dtype=self.kv_cache_dtype,
+                head_num=self.model_config.get_num_kv_heads(self.tp_size),
+                head_dim=self.model_config.head_dim,
+                layer_num=self.model_config.num_hidden_layers,
+                device=self.device,
+                heavy_channel_num=self.server_args.ds_heavy_channel_num,
             )
         else:
             self.token_to_kv_pool = MHATokenToKVPool(
@@ -434,11 +474,11 @@ class ModelRunner:
                 head_num=self.model_config.get_num_kv_heads(self.tp_size),
                 head_dim=self.model_config.head_dim,
                 layer_num=self.model_config.num_hidden_layers,
-                device=device,
+                device=self.device,
             )
         logger.info(
             f"Memory pool end. "
-            f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
     def init_cublas(self):
@@ -463,12 +503,33 @@ class ModelRunner:
                 "Cross attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
             )
-            self.attn_backend = TritonAttnBackend(self)
+            if self.server_args.enable_double_sparsity:
+                self.attn_backend = DoubleSparseAttnBackend(self)
+            else:
+                self.attn_backend = TritonAttnBackend(self)
         else:
             raise ValueError(
                 f"Invalid attention backend: {self.server_args.attention_backend}"
             )
 
+    def init_double_sparsity_channel_config(self, selected_channel):
+
+        selected_channel = "." + selected_channel + "_proj"
+        self.sorted_channels = []
+        # load channel config
+        with open(self.server_args.ds_channel_config_path, "r") as f:
+            channel_config = json.load(f)
+
+        for i in range(self.model_config.num_hidden_layers):
+            key = "model.layers." + str(i) + ".self_attn" + selected_channel
+            self.sorted_channels.append(
+                torch.tensor(channel_config[key])[
+                    :, : self.server_args.ds_heavy_channel_num
+                ]
+                .contiguous()
+                .cuda()
+            )
+
     def init_cuda_graphs(self):
         """Capture cuda graphs."""
         from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
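
init_double_sparsity_channel_config reads a JSON file keyed by "model.layers.{i}.self_attn.{type}_proj" and keeps only the first ds_heavy_channel_num columns of each entry. Below is a hedged sketch of what such a file could look like; the row semantics (one row per head, columns ordered by importance) are an assumption inferred from the [:, :ds_heavy_channel_num] slice, and all values are invented.

# Sketch of a channel-config file accepted by the loader above (values invented).
import json

channel_config = {
    # one entry per decoder layer; key pattern matches the loader above
    "model.layers.0.self_attn.q_proj": [
        [3, 17, 42, 5],   # assumed: head 0, channel indices ranked by importance
        [8, 1, 29, 64],   # assumed: head 1
    ],
    "model.layers.1.self_attn.q_proj": [
        [11, 2, 90, 7],
        [4, 33, 21, 56],
    ],
}

with open("channels.json", "w") as f:
    json.dump(channel_config, f)
# The loader truncates each row to ds_heavy_channel_num entries, makes the
# tensor contiguous, and moves it to the GPU.
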
@@ -491,11 +552,14 @@ class ModelRunner:
         ):
             return self.cuda_graph_runner.replay(forward_batch)
 
+        forward_batch.positions = (forward_batch.seq_lens - 1).to(torch.int64)
+        self.attn_backend.init_forward_metadata(forward_batch)
         return self.model.forward(
             forward_batch.input_ids, forward_batch.positions, forward_batch
         )
 
     def forward_extend(self, forward_batch: ForwardBatch):
+        self.attn_backend.init_forward_metadata(forward_batch)
         if self.is_generation:
             return self.model.forward(
                 forward_batch.input_ids, forward_batch.positions, forward_batch
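
The decode path above now derives the per-request position directly from seq_lens before initializing attention metadata outside the CUDA-graph path. A tiny self-contained illustration of that computation, with toy values:

# Toy illustration of the positions computation added above.
import torch

seq_lens = torch.tensor([5, 12, 1])         # current lengths of three requests
positions = (seq_lens - 1).to(torch.int64)  # position of the token being decoded
print(positions)                            # tensor([ 4, 11,  0])
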
sglang/srt/models/baichuan.py CHANGED
@@ -24,7 +24,6 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -330,7 +329,7 @@ class BaiChuanBaseForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         position_embedding: str,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -404,7 +403,7 @@ class BaichuanForCausalLM(BaiChuanBaseForCausalLM):
     def __init__(
         self,
         config,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         if config.hidden_size == 4096:  # baichuan2 7b
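
The same signature change repeats in baichuan.py above and in every model file that follows: the vllm.config.CacheConfig import and its type annotation are dropped, while the cache_config keyword itself is kept so existing call sites remain valid. A minimal sketch of the pattern with an illustrative class (not an sglang class):

# Illustrative class showing the recurring cache_config change; not from sglang.
from typing import Any, Optional


class ToyForCausalLM:
    def __init__(
        self,
        config: Any,
        cache_config=None,  # was: cache_config: Optional[CacheConfig] = None
        quant_config: Optional[Any] = None,
    ):
        # cache_config is still accepted (callers may pass a config or None),
        # but no import from vllm.config is needed anymore.
        self.config = config
        self.quant_config = quant_config
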
sglang/srt/models/chatglm.py CHANGED
@@ -22,7 +22,6 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from torch.nn import LayerNorm
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -52,7 +51,7 @@ class GLMAttention(nn.Module):
         self,
         config,
         layer_id: int = 0,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -188,7 +187,7 @@ class GLMBlock(nn.Module):
         self,
         config,
         layer_id: int,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -260,7 +259,7 @@ class GLMTransformer(nn.Module):
     def __init__(
         self,
         config,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -308,7 +307,7 @@ class ChatGLMModel(nn.Module):
     def __init__(
         self,
         config,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -359,7 +358,7 @@ class ChatGLMForCausalLM(nn.Module):
     def __init__(
         self,
         config: ChatGLMConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoraConfig] = None,
     ):
sglang/srt/models/commandr.py CHANGED
@@ -45,7 +45,6 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn.parameter import Parameter
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -320,7 +319,7 @@ class CohereForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
sglang/srt/models/dbrx.py CHANGED
@@ -20,7 +20,6 @@ from typing import Iterable, Optional, Tuple
 
 import torch
 import torch.nn as nn
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -368,7 +367,7 @@ class DbrxForCausalLM(nn.Module):
         self,
         config: DbrxConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ):
         super().__init__()
         self.config = config
sglang/srt/models/deepseek.py CHANGED
@@ -21,7 +21,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -185,7 +184,7 @@ class DeepseekAttention(nn.Module):
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -262,7 +261,7 @@ class DeepseekDecoderLayer(nn.Module):
         self,
         config: PretrainedConfig,
         layer_id: int,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -331,7 +330,7 @@ class DeepseekModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -374,7 +373,7 @@ class DeepseekForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
sglang/srt/models/deepseek_v2.py CHANGED
@@ -21,7 +21,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -188,7 +187,7 @@ class DeepseekV2Attention(nn.Module):
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         layer_id=None,
     ) -> None:
@@ -336,7 +335,7 @@ class DeepseekV2AttentionMLA(nn.Module):
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         layer_id=None,
     ) -> None:
@@ -498,7 +497,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         self,
         config: PretrainedConfig,
         layer_id: int,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -594,7 +593,7 @@ class DeepseekV2Model(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -640,7 +639,7 @@ class DeepseekV2ForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
sglang/srt/models/exaone.py CHANGED
@@ -21,7 +21,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
 from torch import nn
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -295,7 +294,7 @@ class ExaoneForCausalLM(nn.Module):
         self,
         config,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
sglang/srt/models/gemma.py CHANGED
@@ -21,7 +21,7 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -279,7 +279,7 @@ class GemmaForCausalLM(nn.Module):
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         del lora_config  # Unused.
         super().__init__()
sglang/srt/models/gemma2.py CHANGED
@@ -20,7 +20,7 @@ from typing import Iterable, Optional, Set, Tuple, Union
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 
 # from vllm.model_executor.layers.rotary_embedding import GemmaRotaryEmbedding
@@ -105,7 +105,7 @@ class Gemma2Attention(nn.Module):
         head_dim: int,
         max_position_embeddings: int,
         rope_theta: float,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -190,7 +190,7 @@ class Gemma2DecoderLayer(nn.Module):
         self,
         layer_idx: int,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -257,7 +257,7 @@ class Gemma2Model(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -336,7 +336,7 @@ class Gemma2ForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ) -> None:
sglang/srt/models/gpt_bigcode.py CHANGED
@@ -21,7 +21,7 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import GPTBigCodeConfig
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -44,7 +44,7 @@ class GPTBigCodeAttention(nn.Module):
         self,
         layer_id: int,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -145,7 +145,7 @@ class GPTBigCodeBlock(nn.Module):
         self,
         layer_id: int,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -183,7 +183,7 @@ class GPTBigCodeModel(nn.Module):
     def __init__(
         self,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ):
@@ -243,7 +243,7 @@ class GPTBigCodeForCausalLM(nn.Module):
     def __init__(
         self,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ):
sglang/srt/models/grok.py CHANGED
@@ -23,7 +23,6 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -289,7 +288,7 @@ class Grok1ForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
sglang/srt/models/internlm2.py CHANGED
@@ -21,7 +21,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -254,7 +253,7 @@ class InternLM2ForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config