sglang 0.4.1.post6__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +24 -16
- sglang/bench_one_batch.py +51 -3
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +37 -28
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +15 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +449 -0
- sglang/srt/entrypoints/http_server.py +579 -0
- sglang/srt/layers/activation.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +69 -0
- sglang/srt/layers/linear.py +41 -5
- sglang/srt/layers/logits_processor.py +48 -63
- sglang/srt/layers/moe/ep_moe/layer.py +4 -4
- sglang/srt/layers/moe/fused_moe_native.py +69 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
- sglang/srt/layers/moe/fused_moe_triton/layer.py +29 -5
- sglang/srt/layers/parameter.py +2 -1
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/fp8.py +6 -3
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1179 -31
- sglang/srt/layers/sampler.py +39 -1
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +23 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +25 -2
- sglang/srt/managers/schedule_batch.py +49 -22
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +277 -178
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +206 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +4 -6
- sglang/srt/model_executor/model_runner.py +37 -15
- sglang/srt/model_loader/loader.py +8 -6
- sglang/srt/model_loader/weight_utils.py +55 -2
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +3 -3
- sglang/srt/models/dbrx.py +4 -4
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +8 -8
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +6 -24
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +6 -6
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +3 -3
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/sampling_batch_info.py +139 -4
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +57 -14
- sglang/srt/utils.py +103 -65
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +16 -5
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +119 -115
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py
CHANGED
@@ -5,14 +5,15 @@ from enum import Enum
 from typing import Callable, List, Optional, Tuple
 
 import torch
-from vllm.distributed import (
+from vllm.model_executor.custom_op import CustomOp
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.custom_op import CustomOp
-
 from sglang.srt.layers.custom_op_util import register_custom_op
+from sglang.srt.layers.moe.fused_moe_native import moe_forward_native
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
@@ -185,8 +186,31 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             inplace=True,
         )
 
-    def forward_cpu(self, *args, **kwargs) -> torch.Tensor:
-        raise NotImplementedError("The CPU backend currently does not support MoE.")
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return moe_forward_native(
+            layer,
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            custom_routing_function,
+            correction_bias,
+        )
 
     def forward_tpu(self, *args, **kwargs) -> torch.Tensor:
         raise NotImplementedError("The TPU backend currently does not support MoE.")
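The new `forward_cpu` delegates to `moe_forward_native` in `sglang/srt/layers/moe/fused_moe_native.py`, a file added in this release (+69 lines). For orientation only, here is a minimal sketch of what a pure-PyTorch MoE forward of this kind computes; the function name, helper structure, and weight shapes below are illustrative assumptions, not the actual `moe_forward_native` implementation:

# Hedged sketch: a naive pure-PyTorch MoE forward of the kind forward_cpu
# now delegates to. All names and shapes here are illustrative only.
import torch


def naive_moe_forward(
    x: torch.Tensor,               # [num_tokens, hidden]
    w1: torch.Tensor,              # [num_experts, 2 * inter, hidden], gate+up proj
    w2: torch.Tensor,              # [num_experts, hidden, inter], down proj
    router_logits: torch.Tensor,   # [num_tokens, num_experts]
    top_k: int,
    renormalize: bool,
) -> torch.Tensor:
    # Route each token to its top-k experts.
    topk_weights, topk_ids = torch.topk(router_logits.softmax(dim=-1), top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    out = torch.zeros_like(x)
    for e in range(w1.shape[0]):
        mask = topk_ids == e  # [num_tokens, top_k]
        if not mask.any():
            continue
        token_idx, slot_idx = mask.nonzero(as_tuple=True)
        h = x[token_idx] @ w1[e].t()              # fused gate+up projection
        gate, up = h.chunk(2, dim=-1)
        h = torch.nn.functional.silu(gate) * up   # SwiGLU activation
        h = h @ w2[e].t()                         # down projection
        # Accumulate each expert's output, scaled by its routing weight.
        out.index_add_(0, token_idx, h * topk_weights[token_idx, slot_idx, None])
    return out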
sglang/srt/layers/parameter.py
CHANGED
@@ -6,7 +6,8 @@ from typing import Callable, Optional, Union
 
 import torch
 from torch.nn import Parameter
-from vllm.distributed import get_tensor_model_parallel_rank
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
 
 __all__ = [
     "BasevLLMParameter",
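`parameter.py` now takes `get_tensor_model_parallel_rank` from `sglang.srt.distributed` instead of vllm. The rank is what lets a sharded parameter class load only its slice of a checkpoint weight; a minimal sketch of that idea (illustrative only, not sglang's actual loader):

# Hedged sketch: each tensor-parallel rank keeps one contiguous slice of a
# column-parallel weight along the output dimension. Illustrative only.
import torch


def shard_column_parallel(full_weight: torch.Tensor, tp_rank: int, tp_size: int) -> torch.Tensor:
    out_features = full_weight.shape[0]
    assert out_features % tp_size == 0, "output dim must divide evenly across ranks"
    shard = out_features // tp_size
    return full_weight[tp_rank * shard : (tp_rank + 1) * shard].clone()


# Example: an 8x4 weight split across 2 ranks leaves each rank a 4x4 shard.
w = torch.randn(8, 4)
assert shard_column_parallel(w, tp_rank=0, tp_size=2).shape == (4, 4)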
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -56,33 +56,13 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     return QUANTIZATION_METHODS[quantization]
 
 
-def fp8_get_quant_method(self, layer, prefix):
-    """Enhanced get_quant_method for FP8 config."""
-    from vllm.model_executor.layers.linear import LinearBase
-    from vllm.model_executor.layers.quantization.utils.quant_utils import (
-        is_layer_skipped,
-    )
-
-    from sglang.srt.layers.linear import UnquantizedLinearMethod
-    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
-    from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod, Fp8MoEMethod
-
-    if isinstance(layer, LinearBase):
-        if is_layer_skipped(prefix, self.ignored_layers):
-            return UnquantizedLinearMethod()
-        return Fp8LinearMethod(self)
-    elif isinstance(layer, FusedMoE):
-        return Fp8MoEMethod(self)
-    return None
-
-
 def gptq_get_quant_method(self, layer, prefix):
-    from vllm.model_executor.layers.linear import LinearBase
     from vllm.model_executor.layers.quantization.gptq_marlin import (
         GPTQMarlinLinearMethod,
         GPTQMarlinMoEMethod,
     )
 
+    from sglang.srt.layers.linear import LinearBase
     from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 
     if isinstance(layer, LinearBase):
@@ -93,12 +73,12 @@ def gptq_get_quant_method(self, layer, prefix):
 
 
 def awq_get_quant_method(self, layer, prefix):
-    from vllm.model_executor.layers.linear import LinearBase
     from vllm.model_executor.layers.quantization.awq_marlin import (
         AWQMarlinLinearMethod,
         AWQMoEMethod,
     )
 
+    from sglang.srt.layers.linear import LinearBase
     from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 
     if isinstance(layer, LinearBase):
@@ -108,13 +88,30 @@ def awq_get_quant_method(self, layer, prefix):
     return None
 
 
+def patch_vllm_linear_base_isinstance():
+    import builtins
+
+    from vllm.model_executor.layers.linear import LinearBase
+
+    from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
+
+    original_isinstance = builtins.isinstance
+
+    def patched_isinstance(obj, classinfo):
+        if classinfo is LinearBase:
+            return original_isinstance(obj, PatchedLinearBase)
+        return original_isinstance(obj, classinfo)
+
+    builtins.isinstance = patched_isinstance
+
+
 def apply_monkey_patches():
     """Apply all monkey patches in one place."""
-    setattr(Fp8Config, "get_quant_method", fp8_get_quant_method)
     setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
     setattr(AWQMarlinConfig, "get_quant_method", awq_get_quant_method)
 
 
+patch_vllm_linear_base_isinstance()
 # Apply patches when module is imported
 apply_monkey_patches()
 
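The new `patch_vllm_linear_base_isinstance` replaces `builtins.isinstance` process-wide so that vllm-side `isinstance(layer, LinearBase)` checks also accept layers built on sglang's own `LinearBase`. A self-contained toy version of the pattern, with stand-in class names rather than the real ones:

# Hedged, self-contained demo of the builtins.isinstance patch pattern above.
# ThirdPartyBase / LocalBase stand in for vllm's and sglang's LinearBase.
import builtins


class ThirdPartyBase: ...       # plays the role of vllm's LinearBase
class LocalBase: ...            # plays the role of sglang's LinearBase
class MyLinear(LocalBase): ...


original_isinstance = builtins.isinstance


def patched_isinstance(obj, classinfo):
    # Redirect checks against the third-party base onto the local base,
    # so third-party code recognizes locally defined layers.
    if classinfo is ThirdPartyBase:
        return original_isinstance(obj, LocalBase)
    return original_isinstance(obj, classinfo)


builtins.isinstance = patched_isinstance

assert isinstance(MyLinear(), ThirdPartyBase)  # now True via the redirect
builtins.isinstance = original_isinstance      # restore when experimenting

Narrowing on the identity check (`classinfo is LinearBase`) keeps every other `isinstance` call on the original fast path, though patching a builtin is still global state and affects the whole process.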
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -8,8 +8,6 @@ import torch.nn.functional as F
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 from vllm import _custom_ops as ops
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear,
@@ -25,7 +23,12 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     requantize_with_max_scale,
 )
 
-from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
sglang/srt/layers/quantization/modelopt_quant.py
CHANGED
@@ -5,14 +5,13 @@ from typing import Any, Dict, List, Optional
 
 import torch
 from torch.nn.parameter import Parameter
-from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
     requantize_with_max_scale,
 )
 
-from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.linear import LinearBase, LinearMethodBase
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
sglang/srt/layers/quantization/w8a8_int8.py
CHANGED
@@ -54,7 +54,7 @@ class W8A8Int8Config(QuantizationConfig):
         layer: torch.nn.Module,
         prefix: str,
     ) -> Optional["QuantizeMethodBase"]:
-        from vllm.model_executor.layers.linear import LinearBase
+        from sglang.srt.layers.linear import LinearBase
 
         if isinstance(layer, LinearBase):
            return W8A8Int8LinearMethod(self)
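Across these quantization files the shape is the same: `get_quant_method` dispatches a per-layer quantize method by the layer's concrete type, now testing against sglang's `LinearBase` rather than vllm's. A hedged toy version of the dispatch pattern, with stand-in names:

# Hedged sketch of the get_quant_method dispatch pattern these configs share:
# choose a per-layer quantization method from the layer's concrete type.
from typing import Optional


class LinearBase: ...              # stand-in for sglang.srt.layers.linear.LinearBase
class MyLinear(LinearBase): ...


class Int8LinearMethod:            # stand-in for W8A8Int8LinearMethod
    def __init__(self, config): self.config = config


class ToyInt8Config:
    def get_quant_method(self, layer, prefix: str) -> Optional[Int8LinearMethod]:
        if isinstance(layer, LinearBase):
            return Int8LinearMethod(self)
        return None  # unmatched layer kinds fall back to unquantized execution


assert isinstance(ToyInt8Config().get_quant_method(MyLinear(), "model.layers.0"), Int8LinearMethod)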
sglang/srt/layers/radix_attention.py
CHANGED
@@ -47,8 +47,8 @@ class RadixAttention(nn.Module):
         self.logit_cap = logit_cap
         self.sliding_window_size = sliding_window_size or -1
         self.is_cross_attention = is_cross_attention
-        self.k_scale =
-        self.v_scale =
+        self.k_scale = None
+        self.v_scale = None
 
     def forward(
         self,