sglang 0.3.3.post1__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff shows the changes between package versions that have been publicly released to a supported registry, as they appear in that registry. It is provided for informational purposes only.
Files changed (74)
  1. sglang/bench_latency.py +28 -10
  2. sglang/bench_server_latency.py +21 -10
  3. sglang/bench_serving.py +101 -7
  4. sglang/global_config.py +0 -1
  5. sglang/srt/layers/attention/__init__.py +27 -5
  6. sglang/srt/layers/attention/double_sparsity_backend.py +281 -0
  7. sglang/srt/layers/attention/flashinfer_backend.py +352 -83
  8. sglang/srt/layers/attention/triton_backend.py +6 -4
  9. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
  10. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
  11. sglang/srt/layers/attention/triton_ops/prefill_attention.py +4 -2
  12. sglang/srt/layers/sampler.py +6 -2
  13. sglang/srt/managers/detokenizer_manager.py +31 -10
  14. sglang/srt/managers/io_struct.py +4 -0
  15. sglang/srt/managers/schedule_batch.py +120 -43
  16. sglang/srt/managers/schedule_policy.py +2 -1
  17. sglang/srt/managers/scheduler.py +202 -140
  18. sglang/srt/managers/tokenizer_manager.py +5 -1
  19. sglang/srt/managers/tp_worker.py +111 -1
  20. sglang/srt/mem_cache/chunk_cache.py +8 -4
  21. sglang/srt/mem_cache/memory_pool.py +77 -4
  22. sglang/srt/mem_cache/radix_cache.py +15 -7
  23. sglang/srt/model_executor/cuda_graph_runner.py +4 -4
  24. sglang/srt/model_executor/forward_batch_info.py +16 -21
  25. sglang/srt/model_executor/model_runner.py +60 -1
  26. sglang/srt/models/baichuan.py +2 -3
  27. sglang/srt/models/chatglm.py +5 -6
  28. sglang/srt/models/commandr.py +1 -2
  29. sglang/srt/models/dbrx.py +1 -2
  30. sglang/srt/models/deepseek.py +4 -5
  31. sglang/srt/models/deepseek_v2.py +5 -6
  32. sglang/srt/models/exaone.py +1 -2
  33. sglang/srt/models/gemma.py +2 -2
  34. sglang/srt/models/gemma2.py +5 -5
  35. sglang/srt/models/gpt_bigcode.py +5 -5
  36. sglang/srt/models/grok.py +1 -2
  37. sglang/srt/models/internlm2.py +1 -2
  38. sglang/srt/models/llama.py +1 -2
  39. sglang/srt/models/llama_classification.py +1 -2
  40. sglang/srt/models/llama_reward.py +2 -3
  41. sglang/srt/models/llava.py +4 -8
  42. sglang/srt/models/llavavid.py +1 -2
  43. sglang/srt/models/minicpm.py +1 -2
  44. sglang/srt/models/minicpm3.py +5 -6
  45. sglang/srt/models/mixtral.py +1 -2
  46. sglang/srt/models/mixtral_quant.py +1 -2
  47. sglang/srt/models/olmo.py +352 -0
  48. sglang/srt/models/olmoe.py +1 -2
  49. sglang/srt/models/qwen.py +1 -2
  50. sglang/srt/models/qwen2.py +1 -2
  51. sglang/srt/models/qwen2_moe.py +4 -5
  52. sglang/srt/models/stablelm.py +1 -2
  53. sglang/srt/models/torch_native_llama.py +1 -2
  54. sglang/srt/models/xverse.py +1 -2
  55. sglang/srt/models/xverse_moe.py +4 -5
  56. sglang/srt/models/yivl.py +1 -2
  57. sglang/srt/openai_api/adapter.py +92 -49
  58. sglang/srt/openai_api/protocol.py +10 -2
  59. sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
  60. sglang/srt/sampling/sampling_batch_info.py +92 -58
  61. sglang/srt/sampling/sampling_params.py +2 -0
  62. sglang/srt/server.py +116 -17
  63. sglang/srt/server_args.py +121 -45
  64. sglang/srt/utils.py +11 -3
  65. sglang/test/few_shot_gsm8k.py +4 -1
  66. sglang/test/few_shot_gsm8k_engine.py +144 -0
  67. sglang/test/srt/sampling/penaltylib/utils.py +16 -12
  68. sglang/version.py +1 -1
  69. {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/METADATA +72 -29
  70. {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/RECORD +73 -70
  71. {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/WHEEL +1 -1
  72. sglang/srt/layers/attention/flashinfer_utils.py +0 -237
  73. {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/LICENSE +0 -0
  74. {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/top_level.txt +0 -0
sglang/srt/models/olmo.py ADDED
@@ -0,0 +1,352 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ # Adapted from
+ # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/olmo.py#L1
+ """Inference-only OLMo model compatible with HuggingFace weights."""
+ from typing import Iterable, List, Optional, Tuple
+
+ import torch
+ from torch import nn
+ from transformers import OlmoConfig
+ from vllm.distributed import get_tensor_model_parallel_world_size
+ from vllm.model_executor.layers.rotary_embedding import get_rope
+ from vllm.model_executor.layers.vocab_parallel_embedding import (
+     ParallelLMHead,
+     VocabParallelEmbedding,
+ )
+ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+ from sglang.srt.layers.activation import SiluAndMul
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
+ from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+ from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+ class OlmoAttention(nn.Module):
+     """
+     This is the attention block where the output is computed as
+     ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+     (plus another skip connection).
+     """
+
+     def __init__(
+         self,
+         config: OlmoConfig,
+         layer_id: int = 0,
+         quant_config: Optional[QuantizationConfig] = None,
+     ):
+         super().__init__()
+         self.config = config
+         self.hidden_size = config.hidden_size
+         tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+         self.total_num_heads = config.num_attention_heads
+
+         assert self.hidden_size % self.total_num_heads == 0
+         assert self.total_num_heads % tensor_model_parallel_world_size == 0
+
+         self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+         self.head_dim = self.hidden_size // self.total_num_heads
+         self.max_position_embeddings = config.max_position_embeddings
+         self.rope_theta = config.rope_theta
+         self.clip_qkv = config.clip_qkv
+
+         # Attention input projection. Projects x -> (q, k, v)
+         self.qkv_proj = QKVParallelLinear(
+             self.hidden_size,
+             self.head_dim,
+             self.total_num_heads,
+             bias=config.attention_bias,
+         )
+
+         # Rotary embeddings.
+         self.rotary_emb = get_rope(
+             self.head_dim,
+             rotary_dim=self.head_dim,
+             max_position=self.max_position_embeddings,
+             base=self.rope_theta,
+         )
+         self.scaling = self.head_dim**-0.5
+         self.attn = RadixAttention(
+             self.num_heads,
+             self.head_dim,
+             self.scaling,
+             num_kv_heads=self.num_heads,
+             layer_id=layer_id,
+         )
+
+         # Attention output projection.
+         self.o_proj = RowParallelLinear(
+             self.hidden_size,
+             self.hidden_size,
+             bias=config.attention_bias,
+         )
+
+     def forward(
+         self,
+         positions: torch.Tensor,
+         hidden_states: torch.Tensor,
+         forward_batch: ForwardBatch,
+     ) -> torch.Tensor:
+         qkv, _ = self.qkv_proj(hidden_states)
+         if self.clip_qkv is not None:
+             qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+         q, k, v = qkv.chunk(chunks=3, dim=-1)
+         q, k = self.rotary_emb(positions, q, k)
+         attn_output = self.attn(q, k, v, forward_batch)
+         output, _ = self.o_proj(attn_output)
+         return output
+
+
+ class OlmoMLP(nn.Module):
+     """
+     This is the MLP block where the output is computed as
+     ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+     (plus another skip connection).
+     """
+
+     def __init__(
+         self,
+         config: OlmoConfig,
+         quant_config: Optional[QuantizationConfig] = None,
+     ):
+         super().__init__()
+         self.config = config
+         self.hidden_size = config.hidden_size
+         self.intermediate_size = config.intermediate_size
+
+         # Feed-forward input projection.
+         self.gate_up_proj = MergedColumnParallelLinear(
+             self.hidden_size,
+             [self.intermediate_size] * 2,
+             bias=False,
+             quant_config=quant_config,
+         )
+
+         # Activation function.
+         self.act_fn = SiluAndMul()
+
+         # Feed-forward output projection.
+         self.down_proj = RowParallelLinear(
+             self.intermediate_size,
+             self.hidden_size,
+             bias=False,
+             quant_config=quant_config,
+         )
+
+     def forward(
+         self,
+         x: torch.Tensor,
+     ) -> torch.Tensor:
+         gate_up, _ = self.gate_up_proj(x)
+         x = self.act_fn(gate_up)
+         x, _ = self.down_proj(x)
+         return x
+
+
+ class OlmoDecoderLayer(nn.Module):
+     """
+     This is a typical transformer block where the output is
+     computed as ``MLP(LN(x + Attention(LN(x))))``
+     (plus another skip connection).
+     """
+
+     def __init__(
+         self,
+         config: OlmoConfig,
+         layer_id: int = 0,
+         quant_config: Optional[QuantizationConfig] = None,
+     ):
+         super().__init__()
+         # Attention block.
+         self.self_attn = OlmoAttention(config, layer_id, quant_config)
+
+         # MLP block.
+         self.mlp = OlmoMLP(config, quant_config)
+
+         # LayerNorm
+         self.input_layernorm = nn.LayerNorm(
+             config.hidden_size, elementwise_affine=False, bias=False
+         )
+         self.post_attention_layernorm = nn.LayerNorm(
+             config.hidden_size, elementwise_affine=False, bias=False
+         )
+
+     def forward(
+         self,
+         positions: torch.Tensor,
+         hidden_states: torch.Tensor,
+         forward_batch: ForwardBatch,
+     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+         # Attention block.
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+         hidden_states = self.self_attn(positions, hidden_states, forward_batch)
+         hidden_states = hidden_states + residual
+
+         # MLP block.
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states
+         return hidden_states
+
+
+ class OlmoModel(nn.Module):
+
+     def __init__(
+         self, config: OlmoConfig, quant_config: Optional[QuantizationConfig] = None
+     ):
+         super().__init__()
+         self.config = config
+
+         self.embed_tokens = VocabParallelEmbedding(
+             config.vocab_size, config.hidden_size
+         )
+         self.layers = nn.ModuleList(
+             [
+                 OlmoDecoderLayer(config, layer_idx, quant_config)
+                 for layer_idx in range(config.num_hidden_layers)
+             ]
+         )
+         self.norm = nn.LayerNorm(
+             config.hidden_size, elementwise_affine=False, bias=False
+         )
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         forward_batch: ForwardBatch,
+         input_embeds: torch.Tensor = None,
+     ) -> torch.Tensor:
+         """
+         :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+         """
+         # Get embeddings of input.
+         # shape: (batch_size, seq_len, d_model)
+
+         if input_embeds is None:
+             hidden_states = self.embed_tokens(input_ids)
+         else:
+             hidden_states = input_embeds
+
+         # Apply blocks one-by-one.
+         for layer_idx, decoder_layer in enumerate(self.layers):
+             # shape: (batch_size, seq_len, d_model)
+             hidden_states = decoder_layer(
+                 positions,
+                 hidden_states,
+                 forward_batch,
+             )
+
+         # Apply final layer norm.
+         # shape: (batch_size, seq_len or 1, d_model)
+         hidden_states = self.norm(hidden_states)
+         return hidden_states
+
+
+ class OlmoForCausalLM(nn.Module):
+     """
+     Extremely barebones HF model wrapper.
+     """
+
+     def __init__(
+         self,
+         config: OlmoConfig,
+         cache_config=None,
+         quant_config: Optional[QuantizationConfig] = None,
+     ):
+         super().__init__()
+         self.config = config
+         self.model = OlmoModel(config, quant_config)
+         if config.tie_word_embeddings:
+             self.lm_head = self.model.embed_tokens
+         else:
+             self.unpadded_vocab_size = config.vocab_size
+             self.lm_head = ParallelLMHead(
+                 self.unpadded_vocab_size,
+                 config.hidden_size,
+                 org_num_embeddings=config.vocab_size,
+                 quant_config=quant_config,
+             )
+         self.logits_processor = LogitsProcessor(config)
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         forward_batch: ForwardBatch,
+         input_embeds: torch.Tensor = None,
+     ) -> torch.Tensor:
+         hidden_states = self.model(
+             input_ids=input_ids,
+             positions=positions,
+             forward_batch=forward_batch,
+             input_embeds=input_embeds,
+         )
+         return self.logits_processor(
+             input_ids, hidden_states, self.lm_head.weight, forward_batch
+         )
+
+     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+         stacked_params_mapping = [
+             # (param_name, shard_name, shard_id)
+             ("qkv_proj", "q_proj", "q"),
+             ("qkv_proj", "k_proj", "k"),
+             ("qkv_proj", "v_proj", "v"),
+             ("gate_up_proj", "gate_proj", 0),
+             ("gate_up_proj", "up_proj", 1),
+         ]
+         params_dict = dict(self.named_parameters(remove_duplicate=False))
+         for name, loaded_weight in weights:
+             if "rotary_emb.inv_freq" in name:
+                 continue
+             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                 # Models trained using ColossalAI may include these tensors in
+                 # the checkpoint. Skip them.
+                 continue
+             # With tie_word_embeddings, we can skip lm_head.weight
+             # The weight might appear unnecessarily in the files if the model is
+             # processed with quantization, LoRA, fine-tuning, etc.
+             if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                 continue
+             for param_name, weight_name, shard_id in stacked_params_mapping:
+                 if weight_name not in name:
+                     continue
+                 name = name.replace(weight_name, param_name)
+                 # Skip loading extra bias for GPTQ models.
+                 if name.endswith(".bias") and name not in params_dict:
+                     continue
+                 param = params_dict[name]
+                 weight_loader = param.weight_loader
+                 weight_loader(param, loaded_weight, shard_id)
+                 break
+             else:
+                 # Skip loading extra bias for GPTQ models.
+                 if name.endswith(".bias") and name not in params_dict:
+                     continue
+                 param = params_dict[name]
+                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                 weight_loader(param, loaded_weight)
+
+
+ EntryClass = OlmoForCausalLM
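
Note (not part of the diff): the docstrings in OlmoAttention, OlmoMLP, and OlmoDecoderLayer above all describe the same pre-norm residual layout, ``MLP(LN(x + Attention(LN(x))))`` plus an extra skip connection, with LayerNorms that carry no learnable scale or bias. The snippet below is a minimal standalone sketch of just that ordering in plain PyTorch; TinyPreNormBlock and the nn.Linear stand-ins for the attention and MLP sub-modules are illustrative placeholders, not code from the wheel.

import torch
from torch import nn


class TinyPreNormBlock(nn.Module):
    """Pre-norm residual block: x -> x + attn(LN(x)); then x -> x + mlp(LN(x))."""

    def __init__(self, hidden_size: int, attn: nn.Module, mlp: nn.Module):
        super().__init__()
        self.attn = attn  # stand-in for OlmoAttention: any (B, T, H) -> (B, T, H) module
        self.mlp = mlp    # stand-in for OlmoMLP: any (B, T, H) -> (B, T, H) module
        # Non-parametric LayerNorm, mirroring elementwise_affine=False in the layers above.
        self.input_layernorm = nn.LayerNorm(hidden_size, elementwise_affine=False)
        self.post_attention_layernorm = nn.LayerNorm(hidden_size, elementwise_affine=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.input_layernorm(x))          # Attention(LN(x)) + skip
        x = x + self.mlp(self.post_attention_layernorm(x))  # MLP(LN(x)) + skip
        return x


if __name__ == "__main__":
    block = TinyPreNormBlock(16, nn.Linear(16, 16), nn.Linear(16, 16))
    print(block(torch.randn(2, 4, 16)).shape)  # torch.Size([2, 4, 16])

Because elementwise_affine=False creates no parameters, the sketch's LayerNorms behave like the nn.LayerNorm(..., elementwise_affine=False, bias=False) calls in OlmoDecoderLayer and OlmoModel: pure normalization with no learned rescaling.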
sglang/srt/models/olmoe.py CHANGED
@@ -23,7 +23,6 @@ import torch
  import torch.nn.functional as F
  from torch import nn
  from transformers import PretrainedConfig
- from vllm.config import CacheConfig
  from vllm.distributed import (
      get_tensor_model_parallel_world_size,
      tensor_model_parallel_all_reduce,
@@ -298,7 +297,7 @@ class OlmoeForCausalLM(nn.Module):
      def __init__(
          self,
          config: PretrainedConfig,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
sglang/srt/models/qwen.py CHANGED
@@ -20,7 +20,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple
  import torch
  from torch import nn
  from transformers import PretrainedConfig
- from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -243,7 +242,7 @@ class QWenLMHeadModel(nn.Module):
          self,
          config: PretrainedConfig,
          quant_config: Optional[QuantizationConfig] = None,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
      ):
          super().__init__()
          self.config = config
sglang/srt/models/qwen2.py CHANGED
@@ -20,7 +20,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 
  import torch
  from torch import nn
- from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -268,7 +267,7 @@ class Qwen2ForCausalLM(nn.Module):
          self,
          config: Qwen2Config,
          quant_config: Optional[QuantizationConfig] = None,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
      ) -> None:
          super().__init__()
          self.config = config
sglang/srt/models/qwen2_moe.py CHANGED
@@ -23,7 +23,6 @@ import torch
  import torch.nn.functional as F
  from torch import nn
  from transformers import PretrainedConfig
- from vllm.config import CacheConfig
  from vllm.distributed import (
      get_tensor_model_parallel_world_size,
      tensor_model_parallel_all_reduce,
@@ -160,7 +159,7 @@ class Qwen2MoeAttention(nn.Module):
          rope_theta: float = 10000,
          rope_scaling: Optional[Dict[str, Any]] = None,
          max_position_embeddings: int = 8192,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
@@ -236,7 +235,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
          self,
          config: PretrainedConfig,
          layer_id: int,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
@@ -306,7 +305,7 @@ class Qwen2MoeModel(nn.Module):
      def __init__(
          self,
          config: PretrainedConfig,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
@@ -355,7 +354,7 @@ class Qwen2MoeForCausalLM(nn.Module):
      def __init__(
          self,
          config: PretrainedConfig,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
sglang/srt/models/stablelm.py CHANGED
@@ -22,7 +22,6 @@ from typing import Iterable, Optional, Tuple
  import torch
  from torch import nn
  from transformers import PretrainedConfig
- from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -241,7 +240,7 @@ class StableLmForCausalLM(nn.Module):
          self,
          config: PretrainedConfig,
          quant_config: Optional[QuantizationConfig] = None,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
      ) -> None:
          super().__init__()
          self.config = config
sglang/srt/models/torch_native_llama.py CHANGED
@@ -24,7 +24,6 @@ import torch
  from torch import nn
  from torch.nn.parameter import Parameter
  from transformers import LlamaConfig
- from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -380,7 +379,7 @@ class TorchNativeLlamaForCausalLM(nn.Module):
          self,
          config: LlamaConfig,
          quant_config: Optional[QuantizationConfig] = None,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
      ) -> None:
          super().__init__()
          self.config = config
sglang/srt/models/xverse.py CHANGED
@@ -22,7 +22,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple
  import torch
  from torch import nn
  from transformers import LlamaConfig
- from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.activation import SiluAndMul
  from vllm.model_executor.layers.layernorm import RMSNorm
@@ -297,7 +296,7 @@ class XverseForCausalLM(nn.Module):
          self,
          config: LlamaConfig,
          quant_config: Optional[QuantizationConfig] = None,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          efficient_weight_load=False,
      ) -> None:
          super().__init__()
sglang/srt/models/xverse_moe.py CHANGED
@@ -19,7 +19,6 @@ from typing import Any, Dict, Iterable, Optional, Tuple
  import torch
  from torch import nn
  from transformers import PretrainedConfig
- from vllm.config import CacheConfig
  from vllm.distributed import (
      get_tensor_model_parallel_rank,
      get_tensor_model_parallel_world_size,
@@ -183,7 +182,7 @@ class XverseAttention(nn.Module):
          rope_theta: float = 10000,
          rope_scaling: Optional[Dict[str, Any]] = None,
          max_position_embeddings: int = 8192,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
@@ -260,7 +259,7 @@ class XverseDecoderLayer(nn.Module):
          self,
          config: PretrainedConfig,
          layer_id: int,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
@@ -328,7 +327,7 @@ class XverseModel(nn.Module):
      def __init__(
          self,
          config: PretrainedConfig,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
@@ -371,7 +370,7 @@ class XverseMoeForCausalLM(nn.Module):
      def __init__(
          self,
          config: PretrainedConfig,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
          quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
sglang/srt/models/yivl.py CHANGED
@@ -20,7 +20,6 @@ from typing import Iterable, Optional, Tuple
  import torch
  import torch.nn as nn
  from transformers import CLIPVisionModel, LlavaConfig
- from vllm.config import CacheConfig
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -32,7 +31,7 @@ class YiVLForCausalLM(LlavaLlamaForCausalLM):
          self,
          config: LlavaConfig,
          quant_config: Optional[QuantizationConfig] = None,
-         cache_config: Optional[CacheConfig] = None,
+         cache_config=None,
      ) -> None:
          super().__init__(config, quant_config, cache_config)
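
Note (not part of the diff): every modified model file above receives the same two-part change. The module-level ``from vllm.config import CacheConfig`` import is deleted, and each constructor's ``cache_config`` parameter drops its ``Optional[CacheConfig]`` annotation in favor of a plain ``cache_config=None`` default, so these modules no longer need ``vllm.config`` at import time. A condensed before/after sketch (``SomeForCausalLM`` is a placeholder class name, not a file from this diff):

from torch import nn


class SomeForCausalLM(nn.Module):  # placeholder, not a class from this diff
    # 0.3.3.post1 signature (required ``from vllm.config import CacheConfig``):
    #     def __init__(self, config, quant_config=None,
    #                  cache_config: Optional[CacheConfig] = None) -> None:
    #
    # 0.3.4 signature, as applied in each hunk above:
    def __init__(self, config, quant_config=None, cache_config=None) -> None:
        super().__init__()
        self.config = config
        # cache_config stays keyword-compatible for callers, but the annotation
        # (and with it the vllm.config dependency) is gone.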