sglang 0.3.1.post2__py3-none-any.whl → 0.3.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. sglang/bench_latency.py +8 -1
  2. sglang/srt/layers/activation.py +3 -2
  3. sglang/srt/layers/attention_backend.py +3 -1
  4. sglang/srt/layers/linear.py +1133 -0
  5. sglang/srt/layers/quantization/__init__.py +76 -0
  6. sglang/srt/layers/quantization/base_config.py +122 -0
  7. sglang/srt/models/baichuan.py +1 -1
  8. sglang/srt/models/chatglm.py +6 -6
  9. sglang/srt/models/commandr.py +7 -7
  10. sglang/srt/models/dbrx.py +7 -7
  11. sglang/srt/models/deepseek.py +7 -7
  12. sglang/srt/models/deepseek_v2.py +7 -7
  13. sglang/srt/models/exaone.py +6 -6
  14. sglang/srt/models/gemma.py +6 -6
  15. sglang/srt/models/gemma2.py +6 -6
  16. sglang/srt/models/gpt_bigcode.py +6 -6
  17. sglang/srt/models/grok.py +6 -6
  18. sglang/srt/models/internlm2.py +6 -6
  19. sglang/srt/models/llama.py +6 -6
  20. sglang/srt/models/llama_classification.py +1 -1
  21. sglang/srt/models/llava.py +1 -1
  22. sglang/srt/models/llavavid.py +1 -1
  23. sglang/srt/models/minicpm.py +6 -6
  24. sglang/srt/models/minicpm3.py +1 -1
  25. sglang/srt/models/mixtral.py +6 -6
  26. sglang/srt/models/mixtral_quant.py +6 -6
  27. sglang/srt/models/olmoe.py +1 -1
  28. sglang/srt/models/qwen.py +6 -6
  29. sglang/srt/models/qwen2.py +6 -6
  30. sglang/srt/models/qwen2_moe.py +7 -7
  31. sglang/srt/models/stablelm.py +6 -6
  32. sglang/srt/models/xverse.py +1 -1
  33. sglang/srt/models/xverse_moe.py +1 -1
  34. sglang/srt/models/yivl.py +1 -1
  35. sglang/srt/utils.py +21 -1
  36. sglang/test/test_utils.py +4 -2
  37. sglang/version.py +1 -1
  38. {sglang-0.3.1.post2.dist-info → sglang-0.3.1.post3.dist-info}/METADATA +3 -2
  39. {sglang-0.3.1.post2.dist-info → sglang-0.3.1.post3.dist-info}/RECORD +42 -39
  40. {sglang-0.3.1.post2.dist-info → sglang-0.3.1.post3.dist-info}/LICENSE +0 -0
  41. {sglang-0.3.1.post2.dist-info → sglang-0.3.1.post3.dist-info}/WHEEL +0 -0
  42. {sglang-0.3.1.post2.dist-info → sglang-0.3.1.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/__init__.py ADDED
@@ -0,0 +1,76 @@
+ # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py
+
+ from typing import Dict, Type
+
+ from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
+ from vllm.model_executor.layers.quantization.awq import AWQConfig
+ from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
+ from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
+ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
+     CompressedTensorsConfig,
+ )
+ from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
+ from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
+ from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config
+ from vllm.model_executor.layers.quantization.fp8 import Fp8Config
+ from vllm.model_executor.layers.quantization.gguf import GGUFConfig
+ from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig
+ from vllm.model_executor.layers.quantization.gptq_marlin_24 import GPTQMarlin24Config
+ from vllm.model_executor.layers.quantization.marlin import MarlinConfig
+ from vllm.model_executor.layers.quantization.qqq import QQQConfig
+ from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
+ from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig
+
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
+ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
+     "aqlm": AQLMConfig,
+     "awq": AWQConfig,
+     "deepspeedfp": DeepSpeedFPConfig,
+     "tpu_int8": Int8TpuConfig,
+     "fp8": Fp8Config,
+     "fbgemm_fp8": FBGEMMFp8Config,
+     # The order of gptq methods is important for config.py iteration over
+     # override_quantization_method(..)
+     "marlin": MarlinConfig,
+     "gguf": GGUFConfig,
+     "gptq_marlin_24": GPTQMarlin24Config,
+     "gptq_marlin": GPTQMarlinConfig,
+     "awq_marlin": AWQMarlinConfig,
+     "gptq": GPTQConfig,
+     "squeezellm": SqueezeLLMConfig,
+     "compressed-tensors": CompressedTensorsConfig,
+     "bitsandbytes": BitsAndBytesConfig,
+     "qqq": QQQConfig,
+     "experts_int8": ExpertsInt8Config,
+ }
+
+
+ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
+     if quantization not in QUANTIZATION_METHODS:
+         raise ValueError(f"Invalid quantization method: {quantization}")
+     return QUANTIZATION_METHODS[quantization]
+
+
+ __all__ = [
+     "QuantizationConfig",
+     "get_quantization_config",
+     "QUANTIZATION_METHODS",
+ ]
+
+ """
+ def fp8_get_quant_method(
+     self, layer: torch.nn.Module, prefix: str
+ ) -> Optional["QuantizeMethodBase"]:
+     if isinstance(layer, LinearBase):
+         if is_layer_skipped(prefix, self.ignored_layers):
+             return UnquantizedLinearMethod()
+         return Fp8LinearMethod(self)
+     elif isinstance(layer, FusedMoE):
+         return Fp8MoEMethod(self)
+     return None
+
+
+ setattr(Fp8Config, "get_quant_method", fp8_get_quant_method)
+ """
sglang/srt/layers/quantization/base_config.py ADDED
@@ -0,0 +1,122 @@
+ # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/base_config.py
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Optional
+
+ import torch
+ from torch import nn
+
+
+ class QuantizeMethodBase(ABC):
+     """Base class for different quantized methods."""
+
+     @abstractmethod
+     def create_weights(
+         self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs
+     ):
+         """Create weights for a layer.
+
+         The weights will be set as attributes of the layer."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def apply(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
+         """Apply the weights in layer to the input tensor.
+
+         Expects create_weights to have been called before on the layer."""
+         raise NotImplementedError
+
+     def process_weights_after_loading(self, layer: nn.Module) -> None:
+         """Process the weight after loading.
+
+         This can be used for example, to transpose weights for computation.
+         """
+         return
+
+
+ class QuantizationConfig(ABC):
+     """Base class for quantization configs."""
+
+     @abstractmethod
+     def get_name(self) -> str:
+         """Name of the quantization method."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def get_supported_act_dtypes(self) -> List[torch.dtype]:
+         """List of supported activation dtypes."""
+         raise NotImplementedError
+
+     @classmethod
+     @abstractmethod
+     def get_min_capability(cls) -> int:
+         """Minimum GPU capability to support the quantization method.
+
+         E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
+         This requirement is due to the custom CUDA kernels used by the
+         quantization method.
+         """
+         raise NotImplementedError
+
+     @staticmethod
+     @abstractmethod
+     def get_config_filenames() -> List[str]:
+         """List of filenames to search for in the model directory."""
+         raise NotImplementedError
+
+     @classmethod
+     @abstractmethod
+     def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig":
+         """Create a config class from the model's quantization config."""
+         raise NotImplementedError
+
+     @classmethod
+     def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+         """
+         Detects if this quantization method can support a given checkpoint
+         format by overriding the user specified quantization method --
+         this method should only be overwritten by subclasses in exceptional
+         circumstances
+         """
+         return None
+
+     @staticmethod
+     def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
+         """Get a value from the model's quantization config."""
+         for key in keys:
+             if key in config:
+                 return config[key]
+         raise ValueError(
+             f"Cannot find any of {keys} in the model's " "quantization config."
+         )
+
+     @staticmethod
+     def get_from_keys_or(config: Dict[str, Any], keys: List[str], default: Any) -> Any:
+         """Get a optional value from the model's quantization config."""
+         try:
+             return QuantizationConfig.get_from_keys(config, keys)
+         except ValueError:
+             return default
+
+     @abstractmethod
+     def get_quant_method(
+         self, layer: torch.nn.Module, prefix: str
+     ) -> Optional[QuantizeMethodBase]:
+         """Get the quantize method to use for the quantized layer.
+
+         Args:
+             layer: The layer for the quant method.
+             prefix: The full name of the layer in the state dict
+         Returns:
+             The quantize method. None if the given layer doesn't support quant
+             method.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def get_scaled_act_names(self) -> List[str]:
+         """Returns the activation function names that should be post-scaled.
+
+         For now, this is only used by AWQ.
+         """
+         raise NotImplementedError
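
To make the contract above concrete, here is a minimal, hypothetical no-op implementation of the two base classes. PassthroughConfig and PassthroughLinearMethod are invented names for illustration only and do not ship with sglang; the sketch assumes nothing beyond the interface shown in the diff.

from typing import Any, Dict, List, Optional

import torch
from torch import nn

from sglang.srt.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
)


class PassthroughLinearMethod(QuantizeMethodBase):
    """No-op method: keeps an unquantized weight on the layer."""

    def create_weights(self, layer: nn.Module, *weight_args, **extra_weight_attrs):
        # Illustrative convention: (in_features, out_features, dtype) positional args.
        in_features, out_features, dtype = weight_args
        layer.weight = nn.Parameter(
            torch.empty(out_features, in_features, dtype=dtype), requires_grad=False
        )

    def apply(self, layer: nn.Module, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        return torch.nn.functional.linear(x, layer.weight)


class PassthroughConfig(QuantizationConfig):
    """Illustrative config that 'quantizes' nothing."""

    def get_name(self) -> str:
        return "passthrough"

    def get_supported_act_dtypes(self) -> List[torch.dtype]:
        return [torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        return 70  # Volta or newer

    @staticmethod
    def get_config_filenames() -> List[str]:
        return []

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "PassthroughConfig":
        return cls()

    def get_quant_method(
        self, layer: nn.Module, prefix: str
    ) -> Optional[QuantizeMethodBase]:
        # Only plain linear layers get a method; everything else is left alone.
        return PassthroughLinearMethod() if isinstance(layer, nn.Linear) else None

    def get_scaled_act_names(self) -> List[str]:
        return []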
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (
      QKVParallelLinear,
      RowParallelLinear,
  )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -45,6 +44,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -24,12 +24,6 @@ from torch import nn
  from torch.nn import LayerNorm
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -40,7 +34,13 @@ from vllm.transformers_utils.configs import ChatGLMConfig

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -50,21 +50,21 @@ from vllm.distributed import (
      get_tensor_model_parallel_rank,
      get_tensor_model_parallel_world_size,
  )
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
- from vllm.model_executor.utils import set_weight_attrs

  from sglang.srt.layers.activation import SiluAndMul
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
+ from sglang.srt.utils import set_weight_attrs


  @torch.compile
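
Alongside the linear and quantization imports, this hunk (and the dbrx hunks below) retargets set_weight_attrs from vllm.model_executor.utils to sglang.srt.utils, consistent with the +21 -1 change to sglang/srt/utils.py in the file list. A minimal sketch of what such a helper typically does, assuming the vendored version mirrors the vllm utility it replaces (the actual sglang implementation may differ):

# Sketch of a set_weight_attrs helper under the assumption above.
from typing import Any, Dict, Optional

import torch


def set_weight_attrs(
    weight: torch.Tensor, weight_attrs: Optional[Dict[str, Any]]
) -> None:
    """Attach loader metadata (e.g. a weight_loader callback) to a parameter."""
    if weight_attrs is None:
        return
    for key, value in weight_attrs.items():
        # Refuse to silently overwrite an attribute that already exists on the tensor.
        assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
        setattr(weight, key, value)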
sglang/srt/models/dbrx.py CHANGED
@@ -27,12 +27,6 @@ from vllm.distributed import (
      tensor_model_parallel_all_reduce,
  )
  from vllm.model_executor.layers.fused_moe import fused_moe
- from vllm.model_executor.layers.linear import (
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      DEFAULT_VOCAB_PADDING_SIZE,
@@ -40,12 +34,18 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
      VocabParallelEmbedding,
  )
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
- from vllm.model_executor.utils import set_weight_attrs
  from vllm.transformers_utils.configs.dbrx import DbrxConfig

+ from sglang.srt.layers.linear import (
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
+ from sglang.srt.utils import set_weight_attrs


  class DbrxRouter(nn.Module):
@@ -28,13 +28,6 @@ from vllm.distributed import (
      tensor_model_parallel_all_reduce,
  )
  from vllm.model_executor.layers.fused_moe import fused_moe
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -44,7 +37,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -27,13 +27,6 @@ from vllm.distributed import (
      tensor_model_parallel_all_reduce,
  )
  from vllm.model_executor.layers.fused_moe import FusedMoE
- from vllm.model_executor.layers.linear import (
-     ColumnParallelLinear,
-     MergedColumnParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -43,7 +36,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     ColumnParallelLinear,
+     MergedColumnParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
@@ -23,12 +23,6 @@ import torch
  from torch import nn
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -23,19 +23,19 @@ from torch import nn
  from transformers import PretrainedConfig
  from vllm.config import CacheConfig, LoRAConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import GeluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -22,12 +22,6 @@ from torch import nn
  from transformers import PretrainedConfig
  from vllm.config import CacheConfig, LoRAConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig

  # from vllm.model_executor.layers.rotary_embedding import GemmaRotaryEmbedding
  from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -35,7 +29,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import GeluAndMul
  from sglang.srt.layers.layernorm import GemmaRMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -23,17 +23,17 @@ from torch import nn
  from transformers import GPTBigCodeConfig
  from vllm.config import CacheConfig, LoRAConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     ColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import get_act_fn
+ from sglang.srt.layers.linear import (
+     ColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

sglang/srt/models/grok.py CHANGED
@@ -28,12 +28,6 @@ from vllm.distributed import (
      get_tensor_model_parallel_rank,
      get_tensor_model_parallel_world_size,
  )
- from vllm.model_executor.layers.linear import (
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -44,7 +38,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.fused_moe import FusedMoE
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -23,12 +23,6 @@ from torch import nn
  from transformers import PretrainedConfig
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -24,12 +24,6 @@ from torch import nn
  from transformers import LlamaConfig
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -39,7 +33,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.torchao_utils import apply_torchao_config_
  from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -19,10 +19,10 @@ import torch
  from torch import nn
  from transformers import LlamaConfig
  from vllm.config import CacheConfig
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
  from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel

@@ -32,9 +32,9 @@ from transformers import (
  )
  from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
  from vllm.config import CacheConfig
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.mm_utils import (
      get_anyres_image_grid_shape,
      unpad_image,
@@ -23,9 +23,9 @@ from torch import nn
  from transformers import CLIPVisionModel, LlavaConfig
  from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
  from vllm.config import CacheConfig
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
  from sglang.srt.models.llama import LlamaForCausalLM

@@ -22,12 +22,6 @@ import torch
  from torch import nn
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -37,7 +31,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -29,7 +29,6 @@ from vllm.model_executor.layers.linear import (
      ReplicatedLinear,
      RowParallelLinear,
  )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -40,6 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
@@ -24,12 +24,6 @@ from transformers import MixtralConfig
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.fused_moe import FusedMoE
- from vllm.model_executor.layers.linear import (
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      DEFAULT_VOCAB_PADDING_SIZE,
@@ -39,7 +33,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.torchao_utils import apply_torchao_config_
  from sglang.srt.managers.schedule_batch import global_server_args_dict