sglang 0.3.4.post2__py3-none-any.whl → 0.3.5__py3-none-any.whl
- sglang/api.py +1 -1
- sglang/bench_latency.py +3 -3
- sglang/bench_server_latency.py +2 -3
- sglang/bench_serving.py +92 -0
- sglang/global_config.py +9 -3
- sglang/lang/chat_template.py +50 -25
- sglang/lang/interpreter.py +9 -1
- sglang/lang/ir.py +11 -2
- sglang/launch_server.py +1 -1
- sglang/srt/configs/model_config.py +51 -13
- sglang/srt/constrained/__init__.py +18 -0
- sglang/srt/constrained/bnf_cache.py +61 -0
- sglang/srt/constrained/grammar.py +190 -0
- sglang/srt/hf_transformers_utils.py +6 -5
- sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
- sglang/srt/layers/fused_moe/fused_moe.py +4 -3
- sglang/srt/layers/fused_moe/layer.py +28 -0
- sglang/srt/layers/quantization/base_config.py +16 -1
- sglang/srt/layers/vocab_parallel_embedding.py +486 -0
- sglang/srt/managers/data_parallel_controller.py +7 -6
- sglang/srt/managers/detokenizer_manager.py +9 -11
- sglang/srt/managers/image_processor.py +4 -3
- sglang/srt/managers/io_struct.py +70 -78
- sglang/srt/managers/schedule_batch.py +33 -49
- sglang/srt/managers/schedule_policy.py +24 -13
- sglang/srt/managers/scheduler.py +137 -80
- sglang/srt/managers/tokenizer_manager.py +224 -336
- sglang/srt/managers/tp_worker.py +5 -5
- sglang/srt/mem_cache/flush_cache.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +7 -4
- sglang/srt/model_executor/model_runner.py +8 -17
- sglang/srt/models/baichuan.py +4 -4
- sglang/srt/models/chatglm.py +4 -4
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +5 -5
- sglang/srt/models/deepseek.py +4 -4
- sglang/srt/models/deepseek_v2.py +4 -4
- sglang/srt/models/exaone.py +4 -4
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt2.py +287 -0
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +4 -4
- sglang/srt/models/internlm2.py +4 -4
- sglang/srt/models/llama.py +15 -7
- sglang/srt/models/llama_embedding.py +2 -10
- sglang/srt/models/llama_reward.py +5 -0
- sglang/srt/models/minicpm.py +4 -4
- sglang/srt/models/minicpm3.py +4 -4
- sglang/srt/models/mixtral.py +7 -5
- sglang/srt/models/mixtral_quant.py +4 -4
- sglang/srt/models/mllama.py +5 -5
- sglang/srt/models/olmo.py +4 -4
- sglang/srt/models/olmoe.py +4 -4
- sglang/srt/models/qwen.py +4 -4
- sglang/srt/models/qwen2.py +4 -4
- sglang/srt/models/qwen2_moe.py +4 -4
- sglang/srt/models/qwen2_vl.py +4 -8
- sglang/srt/models/stablelm.py +4 -4
- sglang/srt/models/torch_native_llama.py +4 -4
- sglang/srt/models/xverse.py +4 -4
- sglang/srt/models/xverse_moe.py +4 -4
- sglang/srt/openai_api/adapter.py +52 -66
- sglang/srt/sampling/sampling_batch_info.py +7 -13
- sglang/srt/server.py +31 -35
- sglang/srt/server_args.py +34 -5
- sglang/srt/utils.py +40 -56
- sglang/test/runners.py +2 -1
- sglang/test/test_utils.py +73 -25
- sglang/utils.py +62 -1
- sglang/version.py +1 -1
- sglang-0.3.5.dist-info/METADATA +344 -0
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/RECORD +77 -73
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
- sglang-0.3.4.post2.dist-info/METADATA +0 -899
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama.py
CHANGED
@@ -24,10 +24,6 @@ from torch import nn
 from transformers import LlamaConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
@@ -38,9 +34,14 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

@@ -303,6 +304,7 @@ class LlamaForCausalLM(nn.Module):
         self.model = LlamaModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

     @torch.no_grad()
     def forward(
@@ -311,11 +313,15 @@ class LlamaForCausalLM(nn.Module):
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
     ) -> LogitsProcessorOutput:
         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
-        return self.logits_processor(
-            input_ids, hidden_states, self.lm_head.weight, forward_batch
-        )
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head.weight, forward_batch
+            )
+        else:
+            return self.pooler(hidden_states, forward_batch)

     def get_hidden_dim(self, module_name):
         # return input_dim, output_dim
@@ -409,11 +415,13 @@ class LlamaForCausalLM(nn.Module):
         if (
             hasattr(self.config, "tie_word_embeddings")
             and self.config.tie_word_embeddings
+            and "lm_head.weight" in params_dict
         ):
             # Tie output embedding layer to input embedding layer, to solve issues where lm_head.weight is missing
             param = self.lm_head.weight
             weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, self.model.embed_tokens.weight)
+
         apply_torchao_config_(self, params_dict, set(["proj.weight"]))

sglang/srt/models/llama_embedding.py
CHANGED
@@ -36,9 +36,7 @@ class LlamaEmbeddingModel(nn.Module):
         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
         return self.pooler(hidden_states, forward_batch)

-    def load_weights(
-        self, weights: Iterable[Tuple[str, torch.Tensor]], name=None, loaded_weight=None
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -49,7 +47,7 @@ class LlamaEmbeddingModel(nn.Module):
         ]
         params_dict = dict(self.model.named_parameters())

-        def load_weights_per_param(name, loaded_weight):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
                 return
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
@@ -78,12 +76,6 @@ class LlamaEmbeddingModel(nn.Module):
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)

-        if name is None or loaded_weight is None:
-            for name, loaded_weight in weights:
-                load_weights_per_param(name, loaded_weight)
-        else:
-            load_weights_per_param(name, loaded_weight)
-

 class MistralModel(LlamaEmbeddingModel):
     pass
sglang/srt/models/llama_reward.py
CHANGED
@@ -52,7 +52,12 @@ class LlamaForSequenceClassification(nn.Module):
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
     ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaForSequenceClassification is only used for embedding"
+
         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
         scores = self.score(hidden_states)

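The llama-family hunks above are the behavioral core of this release's model changes: LlamaForCausalLM.forward gains a `get_embedding` flag and a `Pooler(pooling_type=PoolingType.LAST, normalize=True)`, so one model class can return either vocabulary logits or a normalized last-position embedding. The snippet below is a minimal, self-contained sketch of that dispatch pattern only; `TinyPooler` and `TinyCausalLM` are illustrative stand-ins, not sglang APIs, and the real code routes through LogitsProcessor and ForwardBatch.

import torch
import torch.nn.functional as F
from torch import nn


class TinyPooler(nn.Module):
    # Stand-in for a LAST-token pooler with normalization:
    # take the final position's hidden state and L2-normalize it.
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return F.normalize(hidden_states[:, -1, :], dim=-1)


class TinyCausalLM(nn.Module):
    # Sketch of the dual-path forward: logits for generation, pooled vector for embeddings.
    def __init__(self, hidden_size: int = 16, vocab_size: int = 32):
        super().__init__()
        self.backbone = nn.Linear(hidden_size, hidden_size)  # stands in for the transformer body
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        self.pooler = TinyPooler()

    def forward(self, hidden_in: torch.Tensor, get_embedding: bool = False) -> torch.Tensor:
        hidden_states = self.backbone(hidden_in)
        if not get_embedding:
            return self.lm_head(hidden_states)  # generation path: vocabulary logits
        return self.pooler(hidden_states)       # embedding path: pooled, normalized vector


model = TinyCausalLM()
x = torch.randn(2, 5, 16)                  # (batch, seq, hidden)
print(model(x).shape)                      # torch.Size([2, 5, 32])
print(model(x, get_embedding=True).shape)  # torch.Size([2, 16])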
sglang/srt/models/minicpm.py
CHANGED
@@ -22,10 +22,6 @@ import torch
 from torch import nn
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
@@ -38,6 +34,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

sglang/srt/models/minicpm3.py
CHANGED
@@ -29,10 +29,6 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
@@ -40,6 +36,10 @@ from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.utils import is_flashinfer_available
sglang/srt/models/mixtral.py
CHANGED
@@ -24,11 +24,6 @@ from transformers import MixtralConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.layernorm import RMSNorm
@@ -41,6 +36,10 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

@@ -369,6 +368,9 @@ class MixtralForCausalLM(nn.Module):
                     # Skip loading extra bias for GPTQ models.
                     if name.endswith(".bias") and name not in params_dict:
                         continue
+                    # Skip loading kv_scale from ckpts towards new design.
+                    if name.endswith(".kv_scale") and name not in params_dict:
+                        continue
                     if name is None:
                         continue

sglang/srt/models/mixtral_quant.py
CHANGED
@@ -29,10 +29,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.layernorm import RMSNorm
@@ -44,6 +40,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

sglang/srt/models/mllama.py
CHANGED
@@ -15,11 +15,6 @@ from transformers.models.mllama.modeling_mllama import (
     _prepare_aspect_ratio_attention_mask,
 )
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import get_act_fn
@@ -32,6 +27,11 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import ImageInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.models.llama import LlamaDecoderLayer, LlamaMLP
sglang/srt/models/olmo.py
CHANGED
@@ -23,10 +23,6 @@ from torch import nn
 from transformers import OlmoConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
@@ -38,6 +34,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

sglang/srt/models/olmoe.py
CHANGED
@@ -35,10 +35,6 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.utils import print_warning_once

@@ -47,6 +43,10 @@ from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

sglang/srt/models/qwen.py
CHANGED
@@ -22,10 +22,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
@@ -38,6 +34,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

sglang/srt/models/qwen2.py
CHANGED
@@ -22,10 +22,6 @@ import torch
 from torch import nn
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
@@ -39,6 +35,10 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

 Qwen2Config = None
sglang/srt/models/qwen2_moe.py
CHANGED
@@ -29,10 +29,6 @@ from vllm.distributed import (
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
@@ -47,6 +43,10 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

sglang/srt/models/qwen2_vl.py
CHANGED
@@ -23,7 +23,7 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from functools import lru_cache, partial
-from typing import Iterable, List,
+from typing import Iterable, List, Optional, Tuple, Type, TypedDict

 import numpy as np
 import torch
@@ -35,9 +35,7 @@ from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
-from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import SupportsMultiModal

 from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
@@ -47,6 +45,7 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
 from sglang.srt.managers.schedule_batch import ImageInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.models.qwen2 import Qwen2Model
@@ -486,7 +485,7 @@ class Qwen2VisionTransformer(nn.Module):
 cached_get_processor = lru_cache(get_processor)


-class Qwen2VLForConditionalGeneration(nn.Module
+class Qwen2VLForConditionalGeneration(nn.Module):
     def calculate_num_image_tokens(self, image_grid_thw: Tuple[int, int, int]):
         processor = cached_get_processor(self.config._name_or_path)
         grid_t, grid_h, grid_w = image_grid_thw
@@ -536,15 +535,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
     def __init__(
         self,
         config: Qwen2VLConfig,
-        multimodal_config: MultiModalConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()

         self.config = config
-        self.multimodal_config = multimodal_config
-
         self.visual = Qwen2VisionTransformer(
             config.vision_config,
             norm_eps=getattr(config, "rms_norm_eps", 1e-6),
@@ -622,7 +618,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
         extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
         prefix_lens_cpu = forward_batch.extend_prefix_lens.cpu().numpy()
         for i, image in enumerate(forward_batch.image_inputs):
-            if image
+            if image is None:
                 continue
             start_idx = extend_start_loc_cpu[i]
             prefix_len = prefix_lens_cpu[i]
sglang/srt/models/stablelm.py
CHANGED
@@ -24,10 +24,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
@@ -39,6 +35,10 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

sglang/srt/models/torch_native_llama.py
CHANGED
@@ -26,10 +26,6 @@ from torch.nn.parameter import Parameter
 from transformers import LlamaConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
@@ -38,6 +34,10 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorO
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

sglang/srt/models/xverse.py
CHANGED
@@ -31,15 +31,15 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.model_runner import ForwardBatch

sglang/srt/models/xverse_moe.py
CHANGED
@@ -34,15 +34,15 @@ from vllm.model_executor.layers.linear import (
     RowParallelLinear,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
