sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (88)
  1. sglang/bench_serving.py +56 -12
  2. sglang/launch_server.py +2 -0
  3. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
  4. sglang/srt/compilation/backend.py +1 -1
  5. sglang/srt/configs/model_config.py +5 -5
  6. sglang/srt/distributed/parallel_state.py +0 -7
  7. sglang/srt/entrypoints/engine.py +18 -15
  8. sglang/srt/entrypoints/grpc_server.py +0 -1
  9. sglang/srt/entrypoints/http_server.py +75 -94
  10. sglang/srt/environ.py +16 -2
  11. sglang/srt/eplb/expert_distribution.py +30 -0
  12. sglang/srt/function_call/function_call_parser.py +2 -0
  13. sglang/srt/function_call/minimax_m2.py +367 -0
  14. sglang/srt/layers/activation.py +6 -0
  15. sglang/srt/layers/attention/flashattention_backend.py +12 -2
  16. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  17. sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
  18. sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
  19. sglang/srt/layers/attention/utils.py +78 -0
  20. sglang/srt/layers/communicator.py +1 -0
  21. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  22. sglang/srt/layers/layernorm.py +19 -4
  23. sglang/srt/layers/logits_processor.py +5 -0
  24. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  25. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  26. sglang/srt/layers/moe/ep_moe/layer.py +79 -272
  27. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  28. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  29. sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
  30. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  31. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  32. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  33. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  34. sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
  35. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  36. sglang/srt/layers/moe/topk.py +4 -4
  37. sglang/srt/layers/moe/utils.py +3 -4
  38. sglang/srt/layers/quantization/__init__.py +3 -5
  39. sglang/srt/layers/quantization/awq.py +0 -3
  40. sglang/srt/layers/quantization/base_config.py +7 -0
  41. sglang/srt/layers/quantization/fp8.py +68 -63
  42. sglang/srt/layers/quantization/gguf.py +566 -0
  43. sglang/srt/layers/quantization/mxfp4.py +30 -38
  44. sglang/srt/layers/quantization/unquant.py +23 -45
  45. sglang/srt/layers/quantization/w4afp8.py +38 -2
  46. sglang/srt/layers/radix_attention.py +5 -2
  47. sglang/srt/layers/rotary_embedding.py +13 -1
  48. sglang/srt/layers/sampler.py +12 -1
  49. sglang/srt/managers/io_struct.py +3 -0
  50. sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
  51. sglang/srt/managers/scheduler.py +21 -15
  52. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  53. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  54. sglang/srt/managers/tokenizer_manager.py +11 -19
  55. sglang/srt/mem_cache/hicache_storage.py +7 -1
  56. sglang/srt/mem_cache/memory_pool.py +82 -0
  57. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  58. sglang/srt/model_executor/forward_batch_info.py +44 -3
  59. sglang/srt/model_executor/model_runner.py +1 -149
  60. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  61. sglang/srt/models/deepseek_v2.py +147 -44
  62. sglang/srt/models/glm4_moe.py +322 -354
  63. sglang/srt/models/glm4_moe_nextn.py +4 -14
  64. sglang/srt/models/glm4v_moe.py +29 -196
  65. sglang/srt/models/minimax_m2.py +922 -0
  66. sglang/srt/models/nvila.py +355 -0
  67. sglang/srt/models/nvila_lite.py +184 -0
  68. sglang/srt/models/qwen2.py +22 -1
  69. sglang/srt/models/qwen3.py +34 -4
  70. sglang/srt/models/qwen3_moe.py +2 -4
  71. sglang/srt/multimodal/processors/base_processor.py +1 -0
  72. sglang/srt/multimodal/processors/glm4v.py +1 -1
  73. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  74. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  75. sglang/srt/parser/reasoning_parser.py +28 -1
  76. sglang/srt/server_args.py +365 -186
  77. sglang/srt/single_batch_overlap.py +2 -7
  78. sglang/srt/utils/common.py +87 -42
  79. sglang/srt/utils/hf_transformers_utils.py +7 -3
  80. sglang/test/test_deterministic.py +235 -12
  81. sglang/test/test_deterministic_utils.py +2 -1
  82. sglang/version.py +1 -1
  83. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
  84. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
  85. sglang/srt/models/vila.py +0 -306
  86. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  87. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  88. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/vila.py DELETED
@@ -1,306 +0,0 @@
- import logging
- from typing import Any, Dict, Iterable, List, Optional, Tuple, cast
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- from torch import Tensor
- from transformers.configuration_utils import PretrainedConfig
- from transformers.modeling_outputs import BaseModelOutputWithPooling
- from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
- from transformers.models.siglip import SiglipVisionConfig, SiglipVisionModel
-
- import sglang.srt.managers.mm_utils as mm_utils
- import sglang.srt.model_loader.weight_utils as weight_utils
- import sglang.srt.utils as utils
- from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
- from sglang.srt.layers.pooler import Pooler, PoolingType
- from sglang.srt.layers.quantization.base_config import QuantizationConfig
- from sglang.srt.managers.mm_utils import MultiModalityDataPaddingPatternMultimodalTokens
- from sglang.srt.managers.schedule_batch import (
-     Modality,
-     MultimodalDataItem,
-     MultimodalInputs,
- )
- from sglang.srt.model_executor.forward_batch_info import ForwardBatch
- from sglang.srt.models.qwen2 import Qwen2ForCausalLM
-
- logger = logging.getLogger(__name__)
-
-
- ##### BEGIN COPY configuration.py #####
-
-
- class VILAConfig(PretrainedConfig):
-     # Class attributes.
-     model_type: str = "vila"
-     sub_configs: Dict[str, PretrainedConfig] = {
-         "text_config": Qwen2Config(),
-         "vision_config": SiglipVisionConfig(),
-     }
-     _auto_class: Optional[str] = "AutoConfig"
-
-     # Configuration for sub-modules.
-     text_config: Qwen2Config = Qwen2Config()
-     vision_config: SiglipVisionConfig = SiglipVisionConfig()
-
-     # Model configuration.
-     hidden_size: int
-     image_token_id: int
-     mm_hidden_size: int
-     mm_projector_type: str
-     mm_vision_select_feature: str
-     mm_vision_select_layer: int
-     video_token_id: int
-
-     def __init__(
-         self,
-         text_config: Optional[Dict[str, Any]] = None,
-         vision_config: Optional[Dict[str, Any]] = None,
-         *,
-         hidden_size: int = 1536,
-         image_token_id: int = 151649,
-         mm_hidden_size: int = 1152,
-         mm_projector_type: str = "mlp_downsample_3x3_fix",
-         mm_vision_select_feature: str = "cls_patch",
-         mm_vision_select_layer: int = -2,
-         video_token_id: int = 151650,
-         **kwargs,
-     ):
-         super().__init__(**kwargs)
-
-         self.text_config = Qwen2Config(**text_config) if text_config else Qwen2Config()
-         self.vision_config = (
-             SiglipVisionConfig(**vision_config)
-             if vision_config
-             else SiglipVisionConfig()
-         )
-
-         self.hidden_size = hidden_size
-         self.image_token_id = image_token_id
-         self.mm_hidden_size = mm_hidden_size
-         self.mm_projector_type = mm_projector_type
-         self.mm_vision_select_feature = mm_vision_select_feature
-         self.mm_vision_select_layer = mm_vision_select_layer
-         self.video_token_id = video_token_id
-
-
- ##### END COPY configuration.py #####
-
- ##### BEGIN COPY modeling_vila.py #####
-
-
- class DownSample3x3BlockFix(nn.Module):
-     def forward(self, x: Tensor) -> Tensor:
-         """
-         Args:
-             x: The input tensor of shape (batch_size, sequence_length, mm_hidden_size).
-
-         Returns:
-             The output tensor of shape (batch_size, image_pad_len, mm_hidden_size * 9).
-         """
-
-         batch_size, sequence_length, hidden_size = x.shape
-
-         feat_size = int(sequence_length**0.5)
-         if feat_size**2 != sequence_length:
-             raise ValueError(
-                 f"Cannot take square root: sequence_length {sequence_length} is not a perfect square"
-             )
-
-         features = x.reshape(batch_size, feat_size, feat_size, hidden_size)
-
-         pad_after = (3 - feat_size % 3) % 3
-         if pad_after > 0:
-             features = F.pad(features, (0, 0, 0, pad_after, 0, pad_after))
-             feat_size = feat_size + pad_after
-
-         features = features.reshape(
-             batch_size, feat_size // 3, 3, feat_size // 3, 3, hidden_size
-         )
-         features = features.permute(0, 1, 3, 2, 4, 5).contiguous()
-         features = features.reshape(batch_size, -1, 9 * hidden_size)
-
-         return features
-
-
- class MultimodalProjector(nn.Module):
-     layers: nn.Sequential
-
-     def __init__(
-         self,
-         config: VILAConfig,
-         *args,
-         **kwargs,
-     ):
-         super().__init__(*args, **kwargs)
-
-         if config.mm_projector_type == "mlp_downsample_3x3_fix":
-             self.layers = nn.Sequential(
-                 DownSample3x3BlockFix(),
-                 nn.LayerNorm(config.mm_hidden_size * 9),
-                 nn.Linear(
-                     config.mm_hidden_size * 9,
-                     config.mm_hidden_size * 3,
-                 ),
-                 nn.GELU(),
-                 nn.LayerNorm(config.vision_config.hidden_size * 3),
-                 nn.Linear(config.vision_config.hidden_size * 3, config.hidden_size),
-                 nn.GELU(),
-                 nn.Linear(config.hidden_size, config.hidden_size),
-             )
-         else:
-             raise NotImplementedError(
-                 f"Unsupported mm_projector_type: {config.mm_projector_type}"
-             )
-
-         self.layers.type(config.torch_dtype)
-
-     @property
-     def device(self) -> torch.device:
-         return next(self.parameters()).device
-
-     @property
-     def dtype(self) -> torch.dtype:
-         return next(self.parameters()).dtype
-
-     def forward(self, x: Tensor) -> Tensor:
-         """
-         Args:
-             x: The input tensor of shape (batch_size, sequence_length, mm_hidden_size).
-
-         Returns:
-             The output tensor of shape (batch_size, image_pad_len, hidden_size).
-         """
-
-         return self.layers(x.to(device=self.device, dtype=self.dtype))
-
-
- ##### END COPY modeling_vila.py #####
-
-
- class VILAForConditionalGeneration(nn.Module):
-     config: VILAConfig
-     quant_config: Optional[QuantizationConfig]
-
-     logits_processor: LogitsProcessor
-     pooler: Pooler
-
-     llm: Qwen2ForCausalLM
-     mm_projector: MultimodalProjector
-     vision_tower: SiglipVisionModel
-
-     def __init__(
-         self,
-         config: VILAConfig,
-         quant_config: Optional[QuantizationConfig] = None,
-         prefix: str = "",
-     ) -> None:
-         super().__init__()
-
-         self.config = config
-         self.quant_config = quant_config
-
-         self.logits_processor = LogitsProcessor(config)
-         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
-
-         self.llm = Qwen2ForCausalLM(
-             config=config.text_config,
-             quant_config=quant_config,
-             prefix=utils.add_prefix("llm", prefix),
-         )
-         self.mm_projector = MultimodalProjector(config)
-         self.vision_tower = SiglipVisionModel(config.vision_config)
-
-     @property
-     def dtype(self) -> torch.dtype:
-         return self.config.torch_dtype
-
-     def forward(
-         self,
-         input_ids: Tensor,
-         positions: Tensor,
-         forward_batch: ForwardBatch,
-         get_embedding: bool = False,
-     ) -> LogitsProcessorOutput:
-         output = mm_utils.general_mm_embed_routine(
-             input_ids=input_ids,
-             forward_batch=forward_batch,
-             language_model=self.llm,
-             data_embedding_funcs={
-                 Modality.IMAGE: self.get_image_feature,
-             },
-             get_embedding=get_embedding,
-             positions=positions,
-         )
-
-         return cast(LogitsProcessorOutput, output)
-
-     def get_image_feature(self, mm_input: List[MultimodalDataItem]) -> Tensor:
-         pixel_values = cast(Tensor, mm_input[0].feature)
-
-         ##### BEGIN COPY modeling_vila.py #####
-
-         vision_tower_output: BaseModelOutputWithPooling = self.vision_tower.__call__(
-             pixel_values.to(
-                 device=self.vision_tower.device, dtype=self.vision_tower.dtype
-             ),
-             output_hidden_states=True,
-         )
-
-         mm_projector_input = self._vision_tower_output_to_mm_projector_input(
-             vision_tower_output
-         )
-
-         image_embedding: Tensor = self.mm_projector.__call__(
-             mm_projector_input.to(
-                 device=self.mm_projector.device, dtype=self.mm_projector.dtype
-             )
-         )
-
-         ##### END COPY modeling_vila.py #####
-
-         return image_embedding
-
-     def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> None:
-         params_dict = dict(self.named_parameters())
-
-         for name, loaded_weight in weights:
-             if name.startswith("llm."):
-                 self.llm.load_weights([(name[len("llm.") :], loaded_weight)])
-             else:
-                 param = params_dict[name]
-                 weight_loader = getattr(
-                     param, "weight_loader", weight_utils.default_weight_loader
-                 )
-                 weight_loader(param, loaded_weight)
-
-     def pad_input_ids(
-         self, input_ids: List[int], mm_inputs: MultimodalInputs
-     ) -> List[int]:
-         pattern = MultiModalityDataPaddingPatternMultimodalTokens()
-         return pattern.pad_input_tokens(input_ids, mm_inputs)
-
-     ##### BEGIN COPY modeling_vila.py #####
-
-     def _vision_tower_output_to_mm_projector_input(
-         self,
-         vision_tower_output: BaseModelOutputWithPooling,
-     ) -> Tensor:
-         assert vision_tower_output.hidden_states is not None
-
-         selected_layer_hidden_states = vision_tower_output.hidden_states[
-             self.config.mm_vision_select_layer
-         ]
-
-         if self.config.mm_vision_select_feature == "cls_patch":
-             return selected_layer_hidden_states
-         else:
-             raise NotImplementedError(
-                 f"Unsupported mm_vision_select_feature: {self.config.mm_vision_select_feature}"
-             )
-
-     ##### END COPY modeling_vila.py #####
-
-
- EntryClass = [VILAForConditionalGeneration]
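
For reference, the only non-trivial logic in the deleted file is DownSample3x3BlockFix, which folds a square grid of vision tokens into 3x3 neighborhoods before the projector MLP. The following is a minimal standalone sketch of the same shape arithmetic; the helper name downsample_3x3 and the sample sizes are illustrative only, not part of sglang.

import torch
import torch.nn.functional as F

def downsample_3x3(x: torch.Tensor) -> torch.Tensor:
    # (batch, seq, hidden) -> (batch, ~seq/9, hidden * 9); seq must be a
    # perfect square. Same arithmetic as DownSample3x3BlockFix above.
    batch, seq, hidden = x.shape
    side = int(seq**0.5)
    assert side * side == seq, "sequence length must be a perfect square"
    feats = x.reshape(batch, side, side, hidden)
    pad = (3 - side % 3) % 3
    if pad:
        # F.pad pads trailing dims first: (hidden, width, height).
        feats = F.pad(feats, (0, 0, 0, pad, 0, pad))
        side += pad
    feats = feats.reshape(batch, side // 3, 3, side // 3, 3, hidden)
    feats = feats.permute(0, 1, 3, 2, 4, 5).contiguous()
    return feats.reshape(batch, -1, 9 * hidden)

# A 27x27 grid of vision tokens (729, width 1152) collapses to a 9x9 grid
# of 81 tokens, each the concatenation of one 3x3 neighborhood.
x = torch.randn(2, 729, 1152)
assert downsample_3x3(x).shape == (2, 81, 9 * 1152)

With the config defaults above (mm_hidden_size=1152, hidden_size=1536), 729 vision tokens therefore become 81 tokens of width 9 * 1152 = 10368, which the projector's LayerNorm/Linear stack maps down to 1536. Note the projector mixes config.mm_hidden_size and config.vision_config.hidden_size between layers; this only lines up because both default to 1152.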