sglang 0.4.1.post6__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (122)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +24 -16
  4. sglang/bench_one_batch.py +51 -3
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +37 -28
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +15 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/model_config.py +1 -0
  13. sglang/srt/constrained/base_grammar_backend.py +21 -0
  14. sglang/srt/constrained/xgrammar_backend.py +8 -4
  15. sglang/srt/conversation.py +14 -1
  16. sglang/srt/distributed/__init__.py +3 -3
  17. sglang/srt/distributed/communication_op.py +2 -1
  18. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  21. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  22. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  23. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  24. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  25. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  26. sglang/srt/distributed/parallel_state.py +1 -1
  27. sglang/srt/distributed/utils.py +2 -1
  28. sglang/srt/entrypoints/engine.py +449 -0
  29. sglang/srt/entrypoints/http_server.py +579 -0
  30. sglang/srt/layers/activation.py +3 -3
  31. sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  32. sglang/srt/layers/attention/triton_backend.py +4 -6
  33. sglang/srt/layers/attention/vision.py +204 -0
  34. sglang/srt/layers/dp_attention.py +69 -0
  35. sglang/srt/layers/linear.py +41 -5
  36. sglang/srt/layers/logits_processor.py +48 -63
  37. sglang/srt/layers/moe/ep_moe/layer.py +4 -4
  38. sglang/srt/layers/moe/fused_moe_native.py +69 -0
  39. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
  40. sglang/srt/layers/moe/fused_moe_triton/layer.py +29 -5
  41. sglang/srt/layers/parameter.py +2 -1
  42. sglang/srt/layers/quantization/__init__.py +20 -23
  43. sglang/srt/layers/quantization/fp8.py +6 -3
  44. sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  45. sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  46. sglang/srt/layers/radix_attention.py +2 -2
  47. sglang/srt/layers/rotary_embedding.py +1179 -31
  48. sglang/srt/layers/sampler.py +39 -1
  49. sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  50. sglang/srt/lora/lora.py +1 -9
  51. sglang/srt/managers/configure_logging.py +3 -0
  52. sglang/srt/managers/data_parallel_controller.py +79 -72
  53. sglang/srt/managers/detokenizer_manager.py +23 -6
  54. sglang/srt/managers/image_processor.py +158 -2
  55. sglang/srt/managers/io_struct.py +25 -2
  56. sglang/srt/managers/schedule_batch.py +49 -22
  57. sglang/srt/managers/schedule_policy.py +26 -12
  58. sglang/srt/managers/scheduler.py +277 -178
  59. sglang/srt/managers/session_controller.py +1 -0
  60. sglang/srt/managers/tokenizer_manager.py +206 -121
  61. sglang/srt/managers/tp_worker.py +6 -4
  62. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  63. sglang/srt/managers/utils.py +44 -0
  64. sglang/srt/mem_cache/memory_pool.py +10 -32
  65. sglang/srt/metrics/collector.py +15 -6
  66. sglang/srt/model_executor/cuda_graph_runner.py +4 -6
  67. sglang/srt/model_executor/model_runner.py +37 -15
  68. sglang/srt/model_loader/loader.py +8 -6
  69. sglang/srt/model_loader/weight_utils.py +55 -2
  70. sglang/srt/models/baichuan.py +6 -6
  71. sglang/srt/models/chatglm.py +2 -2
  72. sglang/srt/models/commandr.py +3 -3
  73. sglang/srt/models/dbrx.py +4 -4
  74. sglang/srt/models/deepseek.py +3 -3
  75. sglang/srt/models/deepseek_v2.py +8 -8
  76. sglang/srt/models/exaone.py +2 -2
  77. sglang/srt/models/gemma.py +2 -2
  78. sglang/srt/models/gemma2.py +6 -24
  79. sglang/srt/models/gpt2.py +3 -5
  80. sglang/srt/models/gpt_bigcode.py +1 -1
  81. sglang/srt/models/granite.py +2 -2
  82. sglang/srt/models/grok.py +3 -3
  83. sglang/srt/models/internlm2.py +2 -2
  84. sglang/srt/models/llama.py +7 -5
  85. sglang/srt/models/minicpm.py +2 -2
  86. sglang/srt/models/minicpm3.py +6 -6
  87. sglang/srt/models/minicpmv.py +1238 -0
  88. sglang/srt/models/mixtral.py +3 -3
  89. sglang/srt/models/mixtral_quant.py +3 -3
  90. sglang/srt/models/mllama.py +2 -2
  91. sglang/srt/models/olmo.py +3 -3
  92. sglang/srt/models/olmo2.py +4 -4
  93. sglang/srt/models/olmoe.py +7 -13
  94. sglang/srt/models/phi3_small.py +2 -2
  95. sglang/srt/models/qwen.py +2 -2
  96. sglang/srt/models/qwen2.py +41 -4
  97. sglang/srt/models/qwen2_moe.py +3 -3
  98. sglang/srt/models/qwen2_vl.py +22 -122
  99. sglang/srt/models/stablelm.py +2 -2
  100. sglang/srt/models/torch_native_llama.py +3 -3
  101. sglang/srt/models/xverse.py +6 -6
  102. sglang/srt/models/xverse_moe.py +6 -6
  103. sglang/srt/openai_api/protocol.py +2 -0
  104. sglang/srt/sampling/custom_logit_processor.py +38 -0
  105. sglang/srt/sampling/sampling_batch_info.py +139 -4
  106. sglang/srt/sampling/sampling_params.py +3 -1
  107. sglang/srt/server.py +4 -1090
  108. sglang/srt/server_args.py +57 -14
  109. sglang/srt/utils.py +103 -65
  110. sglang/test/runners.py +8 -13
  111. sglang/test/test_programs.py +1 -1
  112. sglang/test/test_utils.py +3 -1
  113. sglang/utils.py +12 -2
  114. sglang/version.py +1 -1
  115. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +16 -5
  116. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +119 -115
  117. sglang/launch_server_llavavid.py +0 -25
  118. sglang/srt/constrained/__init__.py +0 -16
  119. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  120. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
  121. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0

sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -5,14 +5,15 @@ from enum import Enum
 from typing import Callable, List, Optional, Tuple
 
 import torch
-from vllm.distributed import (
+from vllm.model_executor.custom_op import CustomOp
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.custom_op import CustomOp
-
 from sglang.srt.layers.custom_op_util import register_custom_op
+from sglang.srt.layers.moe.fused_moe_native import moe_forward_native
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,

@@ -185,8 +186,31 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             inplace=True,
         )
 
-    def forward_cpu(self, *args, **kwargs):
-        raise NotImplementedError("The CPU backend currently does not support MoE.")
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return moe_forward_native(
+            layer,
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            custom_routing_function,
+            correction_bias,
+        )
 
     def forward_tpu(self, *args, **kwargs) -> torch.Tensor:
         raise NotImplementedError("The TPU backend currently does not support MoE.")
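
The CPU backend previously raised NotImplementedError; it now delegates to moe_forward_native from the new sglang/srt/layers/moe/fused_moe_native.py (+69 lines), whose body is not shown in this diff. The sketch below is a minimal, unfused MoE forward of the kind such a fallback typically implements (top-k softmax routing over SwiGLU experts); the names, shapes, and weight layout are assumptions for illustration, not sglang's actual implementation.

import torch
import torch.nn.functional as F

def naive_moe_forward(
    x: torch.Tensor,              # [num_tokens, hidden]
    w13: torch.Tensor,            # [num_experts, 2 * intermediate, hidden]
    w2: torch.Tensor,             # [num_experts, hidden, intermediate]
    router_logits: torch.Tensor,  # [num_tokens, num_experts]
    top_k: int,
    renormalize: bool,
) -> torch.Tensor:
    # Pick the top-k experts per token from the router's softmax.
    weights = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    topk_weights, topk_ids = torch.topk(weights, top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

    out = torch.zeros_like(x)
    for expert in range(w13.shape[0]):
        token_idx, slot_idx = (topk_ids == expert).nonzero(as_tuple=True)
        if token_idx.numel() == 0:
            continue
        gate, up = (x[token_idx] @ w13[expert].t()).chunk(2, dim=-1)
        h = (F.silu(gate) * up) @ w2[expert].t()   # SwiGLU, then down-proj
        scale = topk_weights[token_idx, slot_idx].unsqueeze(-1).to(h.dtype)
        out.index_add_(0, token_idx, h * scale)    # weighted sum per token
    return out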

sglang/srt/layers/parameter.py
@@ -6,7 +6,8 @@ from typing import Callable, Optional, Union
 
 import torch
 from torch.nn import Parameter
-from vllm.distributed import get_tensor_model_parallel_rank
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
 
 __all__ = [
     "BasevLLMParameter",

sglang/srt/layers/quantization/__init__.py
@@ -56,33 +56,13 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     return QUANTIZATION_METHODS[quantization]
 
 
-def fp8_get_quant_method(self, layer, prefix):
-    """Enhanced get_quant_method for FP8 config."""
-    from vllm.model_executor.layers.linear import LinearBase
-    from vllm.model_executor.layers.quantization.utils.quant_utils import (
-        is_layer_skipped,
-    )
-
-    from sglang.srt.layers.linear import UnquantizedLinearMethod
-    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
-    from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod, Fp8MoEMethod
-
-    if isinstance(layer, LinearBase):
-        if is_layer_skipped(prefix, self.ignored_layers):
-            return UnquantizedLinearMethod()
-        return Fp8LinearMethod(self)
-    elif isinstance(layer, FusedMoE):
-        return Fp8MoEMethod(self)
-    return None
-
-
 def gptq_get_quant_method(self, layer, prefix):
-    from vllm.model_executor.layers.linear import LinearBase
     from vllm.model_executor.layers.quantization.gptq_marlin import (
         GPTQMarlinLinearMethod,
         GPTQMarlinMoEMethod,
     )
 
+    from sglang.srt.layers.linear import LinearBase
     from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 
     if isinstance(layer, LinearBase):

@@ -93,12 +73,12 @@ def gptq_get_quant_method(self, layer, prefix):
 
 
 def awq_get_quant_method(self, layer, prefix):
-    from vllm.model_executor.layers.linear import LinearBase
     from vllm.model_executor.layers.quantization.awq_marlin import (
         AWQMarlinLinearMethod,
         AWQMoEMethod,
     )
 
+    from sglang.srt.layers.linear import LinearBase
     from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 
     if isinstance(layer, LinearBase):

@@ -108,13 +88,30 @@ def awq_get_quant_method(self, layer, prefix):
     return None
 
 
+def patch_vllm_linear_base_isinstance():
+    import builtins
+
+    from vllm.model_executor.layers.linear import LinearBase
+
+    from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
+
+    original_isinstance = builtins.isinstance
+
+    def patched_isinstance(obj, classinfo):
+        if classinfo is LinearBase:
+            return original_isinstance(obj, PatchedLinearBase)
+        return original_isinstance(obj, classinfo)
+
+    builtins.isinstance = patched_isinstance
+
+
 def apply_monkey_patches():
     """Apply all monkey patches in one place."""
-    setattr(Fp8Config, "get_quant_method", fp8_get_quant_method)
     setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
     setattr(AWQMarlinConfig, "get_quant_method", awq_get_quant_method)
 
 
+patch_vllm_linear_base_isinstance()
 # Apply patches when module is imported
 apply_monkey_patches()
 
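
The new patch_vllm_linear_base_isinstance goes further than the setattr patches it joins: it rebinds builtins.isinstance so that any isinstance check against vllm's LinearBase is answered against sglang's LinearBase instead, which lets vllm's unmodified quantization code accept sglang layers. A standalone sketch of the technique, with stand-in class names:

import builtins

class VendorBase:      # stands in for vllm's LinearBase
    pass

class PatchedBase:     # stands in for sglang's LinearBase
    pass

_original_isinstance = builtins.isinstance

def _patched_isinstance(obj, classinfo):
    # Redirect checks against the vendor class to the patched class;
    # leave every other isinstance check untouched.
    if classinfo is VendorBase:
        return _original_isinstance(obj, PatchedBase)
    return _original_isinstance(obj, classinfo)

builtins.isinstance = _patched_isinstance

assert isinstance(PatchedBase(), VendorBase)   # redirected: True
assert not isinstance(object(), VendorBase)    # unrelated: False

The cost is global: every isinstance call in the process now goes through the wrapper, and the redirect cannot easily be scoped to vllm's modules alone.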

sglang/srt/layers/quantization/fp8.py
@@ -8,8 +8,6 @@ import torch.nn.functional as F
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 from vllm import _custom_ops as ops
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear,

@@ -25,7 +23,12 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     requantize_with_max_scale,
 )
 
-from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,

sglang/srt/layers/quantization/modelopt_quant.py
@@ -5,14 +5,13 @@ from typing import Any, Dict, List, Optional
 
 import torch
 from torch.nn.parameter import Parameter
-from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
     requantize_with_max_scale,
 )
 
-from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.linear import LinearBase, LinearMethodBase
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,

sglang/srt/layers/quantization/w8a8_int8.py
@@ -54,7 +54,7 @@ class W8A8Int8Config(QuantizationConfig):
         layer: torch.nn.Module,
         prefix: str,
     ) -> Optional["QuantizeMethodBase"]:
-        from vllm.model_executor.layers.linear import LinearBase
+        from sglang.srt.layers.linear import LinearBase
 
         if isinstance(layer, LinearBase):
             return W8A8Int8LinearMethod(self)

sglang/srt/layers/radix_attention.py
@@ -47,8 +47,8 @@ class RadixAttention(nn.Module):
         self.logit_cap = logit_cap
         self.sliding_window_size = sliding_window_size or -1
         self.is_cross_attention = is_cross_attention
-        self.k_scale = 1.0
-        self.v_scale = 1.0
+        self.k_scale = None
+        self.v_scale = None
 
     def forward(
         self,
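
Switching the k_scale/v_scale defaults from 1.0 to None turns them into a sentinel: a weight loader can distinguish "no KV-cache scale present in the checkpoint" from "scale explicitly 1.0", and attention backends can skip the multiply entirely when nothing was loaded. A hedged sketch of the pattern this enables; the helper names below are illustrative, not sglang's actual API:

import torch

def load_kv_scales(attn, checkpoint_scales: dict, prefix: str) -> None:
    # dict.get returns None when the checkpoint carries no scale,
    # which is exactly what the new default expresses.
    attn.k_scale = checkpoint_scales.get(f"{prefix}.k_scale")
    attn.v_scale = checkpoint_scales.get(f"{prefix}.v_scale")

def maybe_scale_kv(k: torch.Tensor, v: torch.Tensor, attn):
    # Only quantized checkpoints provide scales; the unquantized path
    # pays nothing and is not silently multiplied by 1.0.
    if attn.k_scale is not None:
        k = k * attn.k_scale
    if attn.v_scale is not None:
        v = v * attn.v_scale
    return k, v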