sglang 0.3.4.post2__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. sglang/api.py +1 -1
  2. sglang/bench_latency.py +3 -3
  3. sglang/bench_server_latency.py +2 -3
  4. sglang/bench_serving.py +92 -0
  5. sglang/global_config.py +9 -3
  6. sglang/lang/chat_template.py +50 -25
  7. sglang/lang/interpreter.py +9 -1
  8. sglang/lang/ir.py +11 -2
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/configs/model_config.py +51 -13
  11. sglang/srt/constrained/__init__.py +18 -0
  12. sglang/srt/constrained/bnf_cache.py +61 -0
  13. sglang/srt/constrained/grammar.py +190 -0
  14. sglang/srt/hf_transformers_utils.py +6 -5
  15. sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
  16. sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
  17. sglang/srt/layers/fused_moe/fused_moe.py +4 -3
  18. sglang/srt/layers/fused_moe/layer.py +28 -0
  19. sglang/srt/layers/quantization/base_config.py +16 -1
  20. sglang/srt/layers/vocab_parallel_embedding.py +486 -0
  21. sglang/srt/managers/data_parallel_controller.py +7 -6
  22. sglang/srt/managers/detokenizer_manager.py +9 -11
  23. sglang/srt/managers/image_processor.py +4 -3
  24. sglang/srt/managers/io_struct.py +70 -78
  25. sglang/srt/managers/schedule_batch.py +33 -49
  26. sglang/srt/managers/schedule_policy.py +24 -13
  27. sglang/srt/managers/scheduler.py +137 -80
  28. sglang/srt/managers/tokenizer_manager.py +224 -336
  29. sglang/srt/managers/tp_worker.py +5 -5
  30. sglang/srt/mem_cache/flush_cache.py +1 -1
  31. sglang/srt/model_executor/cuda_graph_runner.py +7 -4
  32. sglang/srt/model_executor/model_runner.py +8 -17
  33. sglang/srt/models/baichuan.py +4 -4
  34. sglang/srt/models/chatglm.py +4 -4
  35. sglang/srt/models/commandr.py +1 -1
  36. sglang/srt/models/dbrx.py +5 -5
  37. sglang/srt/models/deepseek.py +4 -4
  38. sglang/srt/models/deepseek_v2.py +4 -4
  39. sglang/srt/models/exaone.py +4 -4
  40. sglang/srt/models/gemma.py +1 -1
  41. sglang/srt/models/gemma2.py +1 -1
  42. sglang/srt/models/gpt2.py +287 -0
  43. sglang/srt/models/gpt_bigcode.py +1 -1
  44. sglang/srt/models/grok.py +4 -4
  45. sglang/srt/models/internlm2.py +4 -4
  46. sglang/srt/models/llama.py +15 -7
  47. sglang/srt/models/llama_embedding.py +2 -10
  48. sglang/srt/models/llama_reward.py +5 -0
  49. sglang/srt/models/minicpm.py +4 -4
  50. sglang/srt/models/minicpm3.py +4 -4
  51. sglang/srt/models/mixtral.py +7 -5
  52. sglang/srt/models/mixtral_quant.py +4 -4
  53. sglang/srt/models/mllama.py +5 -5
  54. sglang/srt/models/olmo.py +4 -4
  55. sglang/srt/models/olmoe.py +4 -4
  56. sglang/srt/models/qwen.py +4 -4
  57. sglang/srt/models/qwen2.py +4 -4
  58. sglang/srt/models/qwen2_moe.py +4 -4
  59. sglang/srt/models/qwen2_vl.py +4 -8
  60. sglang/srt/models/stablelm.py +4 -4
  61. sglang/srt/models/torch_native_llama.py +4 -4
  62. sglang/srt/models/xverse.py +4 -4
  63. sglang/srt/models/xverse_moe.py +4 -4
  64. sglang/srt/openai_api/adapter.py +52 -66
  65. sglang/srt/sampling/sampling_batch_info.py +7 -13
  66. sglang/srt/server.py +31 -35
  67. sglang/srt/server_args.py +34 -5
  68. sglang/srt/utils.py +40 -56
  69. sglang/test/runners.py +2 -1
  70. sglang/test/test_utils.py +73 -25
  71. sglang/utils.py +62 -1
  72. sglang/version.py +1 -1
  73. sglang-0.3.5.dist-info/METADATA +344 -0
  74. {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/RECORD +77 -73
  75. {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
  76. sglang-0.3.4.post2.dist-info/METADATA +0 -899
  77. {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
  78. {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama.py CHANGED
@@ -24,10 +24,6 @@ from torch import nn
 from transformers import LlamaConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -38,9 +34,14 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
@@ -303,6 +304,7 @@ class LlamaForCausalLM(nn.Module):
         self.model = LlamaModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
 
     @torch.no_grad()
     def forward(
@@ -311,11 +313,15 @@
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
     ) -> LogitsProcessorOutput:
         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
-        return self.logits_processor(
-            input_ids, hidden_states, self.lm_head.weight, forward_batch
-        )
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head.weight, forward_batch
+            )
+        else:
+            return self.pooler(hidden_states, forward_batch)
 
     def get_hidden_dim(self, module_name):
         # return input_dim, output_dim
@@ -409,11 +415,13 @@ class LlamaForCausalLM(nn.Module):
         if (
             hasattr(self.config, "tie_word_embeddings")
             and self.config.tie_word_embeddings
+            and "lm_head.weight" in params_dict
         ):
             # Tie output embedding layer to input embedding layer, to solve issues where lm_head.weight is missing
             param = self.lm_head.weight
             weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, self.model.embed_tokens.weight)
+
         apply_torchao_config_(self, params_dict, set(["proj.weight"]))
 
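Note: the import hunks above show the pattern repeated across most model files in this release: ParallelLMHead and VocabParallelEmbedding now come from sglang's own sglang/srt/layers/vocab_parallel_embedding.py (new in 0.3.5, +486 lines) instead of vllm. The behavioral change is the new get_embedding branch, which routes the final hidden states through a last-token pooler instead of the logits processor. A minimal sketch of that pooling step, assuming Pooler(pooling_type=PoolingType.LAST, normalize=True) semantics and sglang's flattened [total_tokens, hidden] batch layout (names below are illustrative, not the package API):

import torch
import torch.nn.functional as F

def last_token_pool(hidden_states: torch.Tensor, seq_lens: torch.Tensor) -> torch.Tensor:
    # hidden_states: [total_tokens, hidden] with sequences packed back to back;
    # seq_lens: [batch] sequence lengths. Take each sequence's final hidden
    # state, then L2-normalize it, mirroring PoolingType.LAST + normalize=True.
    last_idx = torch.cumsum(seq_lens, dim=0) - 1
    return F.normalize(hidden_states[last_idx], p=2, dim=-1)

# e.g. two sequences of lengths 3 and 2 packed into 5 token rows:
h = torch.randn(5, 8)
print(last_token_pool(h, torch.tensor([3, 2])).shape)  # torch.Size([2, 8])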
sglang/srt/models/llama_embedding.py CHANGED
@@ -36,9 +36,7 @@ class LlamaEmbeddingModel(nn.Module):
         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
         return self.pooler(hidden_states, forward_batch)
 
-    def load_weights(
-        self, weights: Iterable[Tuple[str, torch.Tensor]], name=None, loaded_weight=None
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -49,7 +47,7 @@ class LlamaEmbeddingModel(nn.Module):
         ]
         params_dict = dict(self.model.named_parameters())
 
-        def load_weights_per_param(name, loaded_weight):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
                 return
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
@@ -78,12 +76,6 @@ class LlamaEmbeddingModel(nn.Module):
             weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
 
-        if name is None or loaded_weight is None:
-            for name, loaded_weight in weights:
-                load_weights_per_param(name, loaded_weight)
-        else:
-            load_weights_per_param(name, loaded_weight)
-
 
 class MistralModel(LlamaEmbeddingModel):
     pass
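The load_weights rewrite above drops the unused per-parameter calling convention and iterates (name, tensor) pairs directly. The stacked_params_mapping context lines show how per-projection checkpoint keys are folded into fused parameters; a self-contained sketch of that remapping, with a hypothetical route() helper that is not part of the package:

stacked_params_mapping = [
    # (param_name, shard_name, shard_id) -- copied from the diff context above
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
]

def route(ckpt_name: str):
    # Map a checkpoint key such as "...q_proj.weight" onto the fused
    # parameter name plus the shard id its weight_loader expects.
    for param_name, shard_name, shard_id in stacked_params_mapping:
        if shard_name in ckpt_name:
            return ckpt_name.replace(shard_name, param_name), shard_id
    return ckpt_name, None  # unfused weights load as-is

print(route("model.layers.0.self_attn.k_proj.weight"))
# -> ('model.layers.0.self_attn.qkv_proj.weight', 'k')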
sglang/srt/models/llama_reward.py CHANGED
@@ -52,7 +52,12 @@ class LlamaForSequenceClassification(nn.Module):
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
     ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaForSequenceClassification is only used for embedding"
+
         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
         scores = self.score(hidden_states)
 
sglang/srt/models/minicpm.py CHANGED
@@ -22,10 +22,6 @@ import torch
 from torch import nn
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -38,6 +34,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
sglang/srt/models/minicpm3.py CHANGED
@@ -29,10 +29,6 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -40,6 +36,10 @@ from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.utils import is_flashinfer_available
sglang/srt/models/mixtral.py CHANGED
@@ -24,11 +24,6 @@ from transformers import MixtralConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.layernorm import RMSNorm
@@ -41,6 +36,10 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
@@ -369,6 +368,9 @@ class MixtralForCausalLM(nn.Module):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
+                # Skip loading kv_scale from ckpts towards new design.
+                if name.endswith(".kv_scale") and name not in params_dict:
+                    continue
                 if name is None:
                     continue
 
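The mixtral.py hunk at line 369 adds a guard mirroring the existing GPTQ-bias skip: checkpoint entries named .kv_scale that the model no longer declares are ignored rather than failing the load. A minimal sketch of that filtering idea, with illustrative names (iter_loadable is hypothetical, not the package API):

def iter_loadable(weights, params_dict):
    # Yield only checkpoint entries the current model actually declares,
    # mirroring the two `continue` guards in the diff above.
    for name, tensor in weights:
        if name.endswith(".bias") and name not in params_dict:
            continue  # extra bias emitted by GPTQ checkpoints
        if name.endswith(".kv_scale") and name not in params_dict:
            continue  # legacy kv_scale entries dropped in the new design
        yield name, tensor

# e.g. a legacy kv_scale entry is silently skipped:
weights = [("layers.0.attn.kv_scale", 1.0), ("layers.0.w", 2.0)]
print(list(iter_loadable(weights, {"layers.0.w": None})))
# -> [('layers.0.w', 2.0)]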
sglang/srt/models/mixtral_quant.py CHANGED
@@ -29,10 +29,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.layernorm import RMSNorm
@@ -44,6 +40,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
sglang/srt/models/mllama.py CHANGED
@@ -15,11 +15,6 @@ from transformers.models.mllama.modeling_mllama import (
     _prepare_aspect_ratio_attention_mask,
 )
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import get_act_fn
@@ -32,6 +27,11 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import ImageInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.models.llama import LlamaDecoderLayer, LlamaMLP
sglang/srt/models/olmo.py CHANGED
@@ -23,10 +23,6 @@ from torch import nn
 from transformers import OlmoConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -38,6 +34,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
sglang/srt/models/olmoe.py CHANGED
@@ -35,10 +35,6 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.utils import print_warning_once
 
@@ -47,6 +43,10 @@ from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
sglang/srt/models/qwen.py CHANGED
@@ -22,10 +22,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -38,6 +34,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
sglang/srt/models/qwen2.py CHANGED
@@ -22,10 +22,6 @@ import torch
 from torch import nn
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -39,6 +35,10 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 Qwen2Config = None
sglang/srt/models/qwen2_moe.py CHANGED
@@ -29,10 +29,6 @@ from vllm.distributed import (
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -47,6 +43,10 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
sglang/srt/models/qwen2_vl.py CHANGED
@@ -23,7 +23,7 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from functools import lru_cache, partial
-from typing import Iterable, List, Mapping, Optional, Tuple, Type, TypedDict, Union
+from typing import Iterable, List, Optional, Tuple, Type, TypedDict
 
 import numpy as np
 import torch
@@ -35,9 +35,7 @@ from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
-from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import SupportsMultiModal
 
 from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
@@ -47,6 +45,7 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
 from sglang.srt.managers.schedule_batch import ImageInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.models.qwen2 import Qwen2Model
@@ -486,7 +485,7 @@ class Qwen2VisionTransformer(nn.Module):
 cached_get_processor = lru_cache(get_processor)
 
 
-class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
+class Qwen2VLForConditionalGeneration(nn.Module):
     def calculate_num_image_tokens(self, image_grid_thw: Tuple[int, int, int]):
         processor = cached_get_processor(self.config._name_or_path)
         grid_t, grid_h, grid_w = image_grid_thw
@@ -536,15 +535,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
     def __init__(
         self,
         config: Qwen2VLConfig,
-        multimodal_config: MultiModalConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
 
         self.config = config
-        self.multimodal_config = multimodal_config
-
         self.visual = Qwen2VisionTransformer(
             config.vision_config,
             norm_eps=getattr(config, "rms_norm_eps", 1e-6),
@@ -622,7 +618,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
         extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
         prefix_lens_cpu = forward_batch.extend_prefix_lens.cpu().numpy()
         for i, image in enumerate(forward_batch.image_inputs):
-            if image == None:
+            if image is None:
                 continue
             start_idx = extend_start_loc_cpu[i]
             prefix_len = prefix_lens_cpu[i]
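The qwen2_vl.py changes remove the remaining vllm multimodal coupling: the SupportsMultiModal mixin, the MultiModalConfig constructor argument, and the vllm ParallelLMHead import all go away, leaving construction driven by the HF config alone (plus the image == None lint fix). A toy sketch of the slimmed constructor shape, using a hypothetical stand-in class rather than the real one:

from types import SimpleNamespace

class Qwen2VLSketch:  # hypothetical stand-in, not the package class
    def __init__(self, config, cache_config=None, quant_config=None):
        # The HF-style config alone now drives construction; no
        # MultiModalConfig and no SupportsMultiModal mixin.
        self.config = config
        self.norm_eps = getattr(config, "rms_norm_eps", 1e-6)

m = Qwen2VLSketch(SimpleNamespace(rms_norm_eps=1e-6))
print(m.norm_eps)  # 1e-06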
sglang/srt/models/stablelm.py CHANGED
@@ -24,10 +24,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -39,6 +35,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
sglang/srt/models/torch_native_llama.py CHANGED
@@ -26,10 +26,6 @@ from torch.nn.parameter import Parameter
 from transformers import LlamaConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
@@ -38,6 +34,10 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
sglang/srt/models/xverse.py CHANGED
@@ -31,15 +31,15 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.model_runner import ForwardBatch
 
 
sglang/srt/models/xverse_moe.py CHANGED
@@ -34,15 +34,15 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 