sglang 0.4.1.post6__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +24 -16
- sglang/bench_one_batch.py +51 -3
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +37 -28
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +15 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +449 -0
- sglang/srt/entrypoints/http_server.py +579 -0
- sglang/srt/layers/activation.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +69 -0
- sglang/srt/layers/linear.py +41 -5
- sglang/srt/layers/logits_processor.py +48 -63
- sglang/srt/layers/moe/ep_moe/layer.py +4 -4
- sglang/srt/layers/moe/fused_moe_native.py +69 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
- sglang/srt/layers/moe/fused_moe_triton/layer.py +29 -5
- sglang/srt/layers/parameter.py +2 -1
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/fp8.py +6 -3
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1179 -31
- sglang/srt/layers/sampler.py +39 -1
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +23 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +25 -2
- sglang/srt/managers/schedule_batch.py +49 -22
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +277 -178
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +206 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +4 -6
- sglang/srt/model_executor/model_runner.py +37 -15
- sglang/srt/model_loader/loader.py +8 -6
- sglang/srt/model_loader/weight_utils.py +55 -2
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +3 -3
- sglang/srt/models/dbrx.py +4 -4
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +8 -8
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +6 -24
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +6 -6
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +3 -3
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/sampling_batch_info.py +139 -4
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +57 -14
- sglang/srt/utils.py +103 -65
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +16 -5
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +119 -115
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/srt/models/exaone.py
CHANGED
@@ -20,9 +20,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
 from torch import nn
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -33,6 +32,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/gemma.py
CHANGED
@@ -21,9 +21,8 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -34,6 +33,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
sglang/srt/models/gemma2.py
CHANGED
@@ -15,13 +15,13 @@
 # Adapted from:
 # https://github.com/vllm-project/vllm/blob/56b325e977435af744f8b3dca7af0ca209663558/vllm/model_executor/models/gemma2.py
 
-from typing import Iterable, Optional, Set, Tuple
+from typing import Iterable, Optional, Set, Tuple
 
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import GemmaRMSNorm
 from sglang.srt.layers.linear import (
@@ -32,6 +32,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -44,23 +45,6 @@ def get_attention_sliding_window_size(config):
     return config.sliding_window - 1
 
 
-# FIXME: temporary solution, remove after next vllm release
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
-
-
-class GemmaRotaryEmbedding(RotaryEmbedding):
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
-        # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/gemma/modeling_gemma.py#L107
-        inv_freq = 1.0 / (
-            base
-            ** (
-                torch.arange(0, self.rotary_dim, 2, dtype=torch.int64).float()
-                / self.rotary_dim
-            )
-        )
-        return inv_freq
-
-
 class Gemma2MLP(nn.Module):
     def __init__(
         self,
@@ -143,14 +127,12 @@ class Gemma2Attention(nn.Module):
             bias=config.attention_bias,
            quant_config=quant_config,
         )
-
-        self.rotary_emb = GemmaRotaryEmbedding(
-            self.head_dim,
+        self.rotary_emb = get_rope(
             self.head_dim,
-
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
             base=self.rope_theta,
             is_neox_style=True,
-            dtype=torch.get_default_dtype(),
         )
 
         use_sliding_window = layer_id % 2 == 0 and hasattr(config, "sliding_window")
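The hunks above swap the local GemmaRotaryEmbedding workaround for sglang's bundled get_rope helper. A minimal sketch of the new construction, using illustrative stand-in values for the config-derived head_dim, max_position_embeddings, and rope_theta:

from sglang.srt.layers.rotary_embedding import get_rope

head_dim = 256          # illustrative; the model uses config.head_dim
rotary_emb = get_rope(
    head_dim,
    rotary_dim=head_dim,
    max_position=8192,   # stand-in for max_position_embeddings
    base=10000.0,        # stand-in for config.rope_theta
    is_neox_style=True,
)
# The returned module is applied to (positions, query, key) inside the attention layer.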
sglang/srt/models/gpt2.py
CHANGED
@@ -17,16 +17,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPT-2 model compatible with HuggingFace weights."""
-from typing import Iterable,
+from typing import Iterable, Optional, Tuple
 
 import torch
 from torch import nn
 from transformers import GPT2Config
-from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 
-
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
sglang/srt/models/gpt_bigcode.py
CHANGED
@@ -21,8 +21,8 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import GPTBigCodeConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import get_act_fn
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
sglang/srt/models/granite.py
CHANGED
@@ -22,9 +22,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import GraniteConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -36,6 +35,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/grok.py
CHANGED
@@ -22,12 +22,11 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -40,6 +39,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/internlm2.py
CHANGED
@@ -19,9 +19,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -32,6 +31,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/llama.py
CHANGED
@@ -22,13 +22,11 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.model_loader.weight_utils import kv_cache_scales_loader
-
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -40,12 +38,16 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
 from sglang.srt.utils import make_layers
 from sglang.utils import get_exception_traceback
 
sglang/srt/models/minicpm.py
CHANGED
@@ -18,9 +18,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
 from torch import nn
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -31,6 +30,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/minicpm3.py
CHANGED
@@ -19,20 +19,20 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-
-from
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
     ColumnParallelLinear,
     MergedColumnParallelLinear,
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
-from sglang.srt.layers.activation import SiluAndMul
-from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,