sglang 0.3.4.post1__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. sglang/api.py +1 -1
  2. sglang/bench_latency.py +3 -3
  3. sglang/bench_server_latency.py +2 -3
  4. sglang/bench_serving.py +92 -0
  5. sglang/global_config.py +9 -3
  6. sglang/lang/chat_template.py +50 -25
  7. sglang/lang/interpreter.py +9 -1
  8. sglang/lang/ir.py +11 -2
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/configs/model_config.py +76 -15
  11. sglang/srt/constrained/__init__.py +18 -0
  12. sglang/srt/constrained/bnf_cache.py +61 -0
  13. sglang/srt/constrained/fsm_cache.py +10 -3
  14. sglang/srt/constrained/grammar.py +190 -0
  15. sglang/srt/hf_transformers_utils.py +20 -5
  16. sglang/srt/layers/attention/flashinfer_backend.py +5 -5
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
  18. sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
  19. sglang/srt/layers/fused_moe/fused_moe.py +4 -3
  20. sglang/srt/layers/fused_moe/layer.py +28 -0
  21. sglang/srt/layers/logits_processor.py +5 -5
  22. sglang/srt/layers/quantization/base_config.py +16 -1
  23. sglang/srt/layers/rotary_embedding.py +15 -48
  24. sglang/srt/layers/sampler.py +51 -39
  25. sglang/srt/layers/vocab_parallel_embedding.py +486 -0
  26. sglang/srt/managers/data_parallel_controller.py +8 -7
  27. sglang/srt/managers/detokenizer_manager.py +11 -9
  28. sglang/srt/managers/image_processor.py +4 -3
  29. sglang/srt/managers/io_struct.py +80 -78
  30. sglang/srt/managers/schedule_batch.py +46 -52
  31. sglang/srt/managers/schedule_policy.py +24 -13
  32. sglang/srt/managers/scheduler.py +145 -82
  33. sglang/srt/managers/tokenizer_manager.py +236 -334
  34. sglang/srt/managers/tp_worker.py +5 -5
  35. sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
  36. sglang/srt/mem_cache/flush_cache.py +1 -1
  37. sglang/srt/mem_cache/memory_pool.py +10 -3
  38. sglang/srt/model_executor/cuda_graph_runner.py +34 -23
  39. sglang/srt/model_executor/forward_batch_info.py +6 -9
  40. sglang/srt/model_executor/model_runner.py +10 -19
  41. sglang/srt/models/baichuan.py +4 -4
  42. sglang/srt/models/chatglm.py +4 -4
  43. sglang/srt/models/commandr.py +1 -1
  44. sglang/srt/models/dbrx.py +5 -5
  45. sglang/srt/models/deepseek.py +4 -4
  46. sglang/srt/models/deepseek_v2.py +4 -4
  47. sglang/srt/models/exaone.py +4 -4
  48. sglang/srt/models/gemma.py +1 -1
  49. sglang/srt/models/gemma2.py +1 -1
  50. sglang/srt/models/gpt2.py +287 -0
  51. sglang/srt/models/gpt_bigcode.py +1 -1
  52. sglang/srt/models/grok.py +4 -4
  53. sglang/srt/models/internlm2.py +4 -4
  54. sglang/srt/models/llama.py +15 -7
  55. sglang/srt/models/llama_embedding.py +2 -10
  56. sglang/srt/models/llama_reward.py +5 -0
  57. sglang/srt/models/minicpm.py +4 -4
  58. sglang/srt/models/minicpm3.py +4 -4
  59. sglang/srt/models/mixtral.py +7 -5
  60. sglang/srt/models/mixtral_quant.py +4 -4
  61. sglang/srt/models/mllama.py +5 -5
  62. sglang/srt/models/olmo.py +4 -4
  63. sglang/srt/models/olmoe.py +4 -4
  64. sglang/srt/models/qwen.py +4 -4
  65. sglang/srt/models/qwen2.py +4 -4
  66. sglang/srt/models/qwen2_moe.py +4 -4
  67. sglang/srt/models/qwen2_vl.py +4 -8
  68. sglang/srt/models/stablelm.py +4 -4
  69. sglang/srt/models/torch_native_llama.py +4 -4
  70. sglang/srt/models/xverse.py +4 -4
  71. sglang/srt/models/xverse_moe.py +4 -4
  72. sglang/srt/openai_api/adapter.py +52 -66
  73. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  74. sglang/srt/sampling/sampling_batch_info.py +7 -13
  75. sglang/srt/sampling/sampling_params.py +5 -7
  76. sglang/srt/server.py +41 -33
  77. sglang/srt/server_args.py +34 -5
  78. sglang/srt/utils.py +40 -56
  79. sglang/test/run_eval.py +2 -0
  80. sglang/test/runners.py +2 -1
  81. sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  82. sglang/test/test_utils.py +151 -6
  83. sglang/utils.py +62 -1
  84. sglang/version.py +1 -1
  85. sglang-0.3.5.dist-info/METADATA +344 -0
  86. sglang-0.3.5.dist-info/RECORD +152 -0
  87. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
  88. sglang-0.3.4.post1.dist-info/METADATA +0 -900
  89. sglang-0.3.4.post1.dist-info/RECORD +0 -148
  90. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
  91. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/layers/vocab_parallel_embedding.py (new file)
@@ -0,0 +1,486 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/layers/vocab_parallel_embedding.py
+
+from dataclasses import dataclass
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter, UninitializedParameter
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.model_executor.parameter import BasevLLMParameter
+
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+    method_has_implemented_embedding,
+)
+from sglang.srt.utils import set_weight_attrs
+
+DEFAULT_VOCAB_PADDING_SIZE = 64
+
+
+class UnquantizedEmbeddingMethod(QuantizeMethodBase):
+    """Unquantized method for embeddings."""
+
+    def create_weights(self, layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: List[int], input_size: int,
+                       output_size: int, params_dtype: torch.dtype,
+                       **extra_weight_attrs):
+        """Create weights for embedding layer."""
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=params_dtype),
+                           requires_grad=False)
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, extra_weight_attrs)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return F.linear(x, layer.weight, bias)
+
+    def embedding(self, layer: torch.nn.Module,
+                  input_: torch.Tensor) -> torch.Tensor:
+        return F.embedding(input_, layer.weight)
+
+
+def pad_vocab_size(vocab_size: int,
+                   pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
+    """Pad the vocab size to the given value."""
+    return ((vocab_size + pad_to - 1) // pad_to) * pad_to
+
+
+def vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size: int,
+        rank: int,
+        offset: int = 0) -> Sequence[int]:
+    index_f = rank * per_partition_vocab_size
+    index_l = index_f + per_partition_vocab_size
+    return index_f + offset, index_l + offset
+
+
+def vocab_range_from_global_vocab_size(global_vocab_size: int,
+                                       rank: int,
+                                       world_size: int,
+                                       offset: int = 0) -> Sequence[int]:
+    per_partition_vocab_size = divide(global_vocab_size, world_size)
+    return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
+                                                     rank,
+                                                     offset=offset)
+
+
+@dataclass
+class VocabParallelEmbeddingShardIndices:
+    """Indices for a shard of a vocab parallel embedding."""
+    padded_org_vocab_start_index: int
+    padded_org_vocab_end_index: int
+    padded_added_vocab_start_index: int
+    padded_added_vocab_end_index: int
+
+    org_vocab_start_index: int
+    org_vocab_end_index: int
+    added_vocab_start_index: int
+    added_vocab_end_index: int
+
+    @property
+    def num_org_elements(self) -> int:
+        return self.org_vocab_end_index - self.org_vocab_start_index
+
+    @property
+    def num_added_elements(self) -> int:
+        return self.added_vocab_end_index - self.added_vocab_start_index
+
+    @property
+    def num_org_elements_padded(self) -> int:
+        return (self.padded_org_vocab_end_index -
+                self.padded_org_vocab_start_index)
+
+    @property
+    def num_added_elements_padded(self) -> int:
+        return (self.padded_added_vocab_end_index -
+                self.padded_added_vocab_start_index)
+
+    @property
+    def num_org_vocab_padding(self) -> int:
+        return self.num_org_elements_padded - self.num_org_elements
+
+    @property
+    def num_added_vocab_padding(self) -> int:
+        return self.num_added_elements_padded - self.num_added_elements
+
+    @property
+    def num_elements_padded(self) -> int:
+        return self.num_org_elements_padded + self.num_added_elements_padded
+
+    def __post_init__(self):
+        # sanity checks
+        assert (self.padded_org_vocab_start_index <=
+                self.padded_org_vocab_end_index)
+        assert (self.padded_added_vocab_start_index <=
+                self.padded_added_vocab_end_index)
+
+        assert self.org_vocab_start_index <= self.org_vocab_end_index
+        assert self.added_vocab_start_index <= self.added_vocab_end_index
+
+        assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
+        assert (self.added_vocab_start_index <=
+                self.padded_added_vocab_start_index)
+        assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
+        assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
+
+        assert self.num_org_elements <= self.num_org_elements_padded
+        assert self.num_added_elements <= self.num_added_elements_padded
+
+
+@torch.jit.script
+def get_masked_input_and_mask(
+        input_: torch.Tensor, org_vocab_start_index: int,
+        org_vocab_end_index: int, num_org_vocab_padding: int,
+        added_vocab_start_index: int,
+        added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    # torch.jit.script will fuse all of the pointwise ops below
+    # into a single kernel, making it very fast
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
+                                                          org_vocab_end_index)
+    added_vocab_mask = (input_ >= added_vocab_start_index) & (
+        input_ < added_vocab_end_index)
+    added_offset = added_vocab_start_index - (
+        org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding
+    valid_offset = (org_vocab_start_index *
+                    org_vocab_mask) + (added_offset * added_vocab_mask)
+    vocab_mask = org_vocab_mask | added_vocab_mask
+    input_ = vocab_mask * (input_ - valid_offset)
+    return input_, ~vocab_mask
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    Adapted from torch.nn.Embedding, note that we pad the vocabulary size to
+    make sure it is divisible by the number of model parallel GPUs.
+
+    In order to support various loading methods, we ensure that LoRA-added
+    embeddings are always at the end of TP-sharded tensors. In other words,
+    we shard base embeddings and LoRA embeddings separately (both padded),
+    and place them in the same tensor.
+    In this example, we will have the original vocab size = 1010,
+    added vocab size = 16 and padding to 64. Therefore, the total
+    vocab size with padding will be 1088 (because we first pad 1010 to
+    1024, add 16, and then pad to 1088).
+    Therefore, the tensor format looks like the following:
+    TP1, rank 0 (no sharding):
+                            |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >|
+    corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1015 | -1 | ... | -1 |
+    index:                  | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 |
+
+    TP2, rank 0:
+                            |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >|
+    corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1000 | ... | 1015 | -1 | ... | -1 |
+    index:                  | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 520 | ... | 543 |
+    TP2, rank 1:
+                            |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
+    corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 |
+    index:                  | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 |
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+        quant_config: quant config for the layer
+        prefix: full name of the layer in the state dict
+    """  # noqa: E501
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 params_dtype: Optional[torch.dtype] = None,
+                 org_num_embeddings: Optional[int] = None,
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "",
+                 enable_tp: bool = True):
+        super().__init__()
+
+        self.enable_tp = enable_tp
+        if self.enable_tp:
+            tp_rank = get_tensor_model_parallel_rank()
+            self.tp_size = get_tensor_model_parallel_world_size()
+        else:
+            tp_rank = 0
+            self.tp_size = 1
+
+        self.num_embeddings = num_embeddings
+        self.padding_size = padding_size
+        self.org_vocab_size = org_num_embeddings or num_embeddings
+        num_added_embeddings = num_embeddings - self.org_vocab_size
+        self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size,
+                                                    self.padding_size)
+        self.num_embeddings_padded = pad_vocab_size(
+            self.org_vocab_size_padded + num_added_embeddings,
+            self.padding_size)
+        assert self.org_vocab_size_padded <= self.num_embeddings_padded
+
+        self.shard_indices = self._get_indices(self.num_embeddings_padded,
+                                               self.org_vocab_size_padded,
+                                               self.num_embeddings,
+                                               self.org_vocab_size, tp_rank,
+                                               self.tp_size)
+        self.embedding_dim = embedding_dim
+
+        linear_method = None
+        if quant_config is not None:
+            linear_method = quant_config.get_quant_method(self, prefix=prefix)
+        if linear_method is None:
+            linear_method = UnquantizedEmbeddingMethod()
+
+        # If we are making an embedding layer, then our quantization linear
+        # method must implement the embedding operation. If we are another
+        # layer type like ParallelLMHead, this is not important.
+        is_embedding_layer = type(self.__class__) is VocabParallelEmbedding
+        linear_method_implements_embedding = method_has_implemented_embedding(
+            type(linear_method))
+        if is_embedding_layer and not linear_method_implements_embedding:
+            raise NotImplementedError(
+                f"The class {type(linear_method).__name__} must implement "
+                "the 'embedding' method, see UnquantizedEmbeddingMethod.")
+
+        self.linear_method: QuantizeMethodBase = linear_method
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        # Divide the weight matrix along the vocaburaly dimension.
+        self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
+        self.num_embeddings_per_partition = divide(self.num_embeddings_padded,
+                                                   self.tp_size)
+        assert (self.shard_indices.num_elements_padded ==
+                self.num_embeddings_per_partition)
+        self.num_org_embeddings_per_partition = (
+            self.shard_indices.org_vocab_end_index -
+            self.shard_indices.org_vocab_start_index)
+        self.num_added_embeddings_per_partition = (
+            self.shard_indices.added_vocab_end_index -
+            self.shard_indices.added_vocab_start_index)
+
+        self.linear_method.create_weights(self,
+                                          self.embedding_dim,
+                                          [self.num_embeddings_per_partition],
+                                          self.embedding_dim,
+                                          self.num_embeddings_padded,
+                                          params_dtype=params_dtype,
+                                          weight_loader=self.weight_loader)
+
+    @classmethod
+    def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int,
+                     vocab_size: int, org_vocab_size: int, tp_rank: int,
+                     tp_size: int) -> VocabParallelEmbeddingShardIndices:
+        """Get start and end indices for vocab parallel embedding, following the
+        layout outlined in the class docstring, based on the given tp_rank and
+        tp_size."""
+        num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded
+        padded_org_vocab_start_index, padded_org_vocab_end_index = (
+            vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank,
+                                               tp_size))
+        padded_added_vocab_start_index, padded_added_vocab_end_index = (
+            vocab_range_from_global_vocab_size(num_added_embeddings_padded,
+                                               tp_rank,
+                                               tp_size,
+                                               offset=org_vocab_size))
+        # remove padding
+        org_vocab_start_index = min(padded_org_vocab_start_index,
+                                    org_vocab_size)
+        org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size)
+        added_vocab_start_index = min(padded_added_vocab_start_index,
+                                      vocab_size)
+        added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size)
+        return VocabParallelEmbeddingShardIndices(
+            padded_org_vocab_start_index, padded_org_vocab_end_index,
+            padded_added_vocab_start_index, padded_added_vocab_end_index,
+            org_vocab_start_index, org_vocab_end_index,
+            added_vocab_start_index, added_vocab_end_index)
+
+    def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
+        """Get a mapping that can be used to reindex the gathered
+        logits for sampling.
+
+        During sampling, we gather logits from all ranks. The relationship
+        of index->token_id will follow the same format as outlined in the class
+        docstring. However, after the gather, we want to reindex the final
+        logits tensor to map index->token_id one-to-one (the index is always
+        equal the token_id it corresponds to). The indices returned by this
+        method allow us to do that.
+        """
+        if self.tp_size < 2:
+            return None
+
+        base_embeddings: List[int] = []
+        added_embeddings: List[int] = []
+        padding: List[int] = []
+        for tp_rank in range(self.tp_size):
+            shard_indices = self._get_indices(self.num_embeddings_padded,
+                                              self.org_vocab_size_padded,
+                                              self.num_embeddings,
+                                              self.org_vocab_size, tp_rank,
+                                              self.tp_size)
+            range_start = self.num_embeddings_per_partition * tp_rank
+            range_end = self.num_embeddings_per_partition * (tp_rank + 1)
+            base_embeddings.extend(
+                range(range_start,
+                      range_start + shard_indices.num_org_elements))
+            padding.extend(
+                range(range_start + shard_indices.num_org_elements,
+                      range_start + shard_indices.num_org_elements_padded))
+            added_embeddings.extend(
+                range(
+                    range_start + shard_indices.num_org_elements_padded,
+                    range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements))
+            padding.extend(
+                range(
+                    range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements,
+                    range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements_padded))
+            assert (range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements_padded == range_end)
+        ret = base_embeddings + added_embeddings + padding
+        assert len(ret) == self.num_embeddings_padded
+        return ret
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        output_dim = getattr(param, "output_dim", None)
+        packed_dim = getattr(param, "packed_dim", None)
+
+        # If the parameter is a gguf weight, then load it directly.
+        if getattr(param, "is_gguf_weight_type", None):
+            param.data.copy_(loaded_weight)
+            param.weight_type = loaded_weight.item()
+            return
+        elif isinstance(param, UninitializedParameter):
+            shape = list(loaded_weight.shape)
+            if output_dim is not None:
+                shape[output_dim] = shape[output_dim] // self.tp_size
+            param.materialize(tuple(shape), dtype=loaded_weight.dtype)
+
+        # If parameter does not have output dim, then it should
+        # be copied onto all gpus (e.g. g_idx for act_order gptq).
+        if output_dim is None:
+            assert param.data.shape == loaded_weight.shape
+            param.data.copy_(loaded_weight)
+            return
+
+        # Shard indexes for loading the weight
+        start_idx = self.shard_indices.org_vocab_start_index
+        shard_size = self.shard_indices.org_vocab_end_index - start_idx
+
+        # If param packed on the same dim we are sharding on, then
+        # need to adjust offsets of loaded weight by pack_factor.
+        if packed_dim is not None and packed_dim == output_dim:
+            packed_factor = param.packed_factor if isinstance(
+                param, BasevLLMParameter) else param.pack_factor
+            assert loaded_weight.shape[output_dim] == (self.org_vocab_size //
+                                                       param.packed_factor)
+            start_idx = start_idx // packed_factor
+            shard_size = shard_size // packed_factor
+        else:
+            assert loaded_weight.shape[output_dim] == self.org_vocab_size
+
+        # Copy the data.
+        loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+        param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
+        param[loaded_weight.shape[0]:].data.fill_(0)
+
+    def forward(self, input_):
+        if self.tp_size > 1:
+            # Build the mask.
+            masked_input, input_mask = get_masked_input_and_mask(
+                input_, self.shard_indices.org_vocab_start_index,
+                self.shard_indices.org_vocab_end_index,
+                self.shard_indices.num_org_vocab_padding,
+                self.shard_indices.added_vocab_start_index,
+                self.shard_indices.added_vocab_end_index)
+        else:
+            masked_input = input_
+        # Get the embeddings.
+        output_parallel = self.linear_method.embedding(self,
+                                                       masked_input.long())
+        # Mask the output embedding.
+        if self.tp_size > 1:
+            output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
+            # Reduce across all the model parallel GPUs.
+            output = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output = output_parallel
+        return output
+
+    def extra_repr(self) -> str:
+        s = f"num_embeddings={self.num_embeddings_per_partition}"
+        s += f", embedding_dim={self.embedding_dim}"
+        s += f", org_vocab_size={self.org_vocab_size}"
+        s += f', num_embeddings_padded={self.num_embeddings_padded}'
+        if self.enable_tp:
+            s += f', tp_size={self.tp_size}'
+        return s
+
+
+class ParallelLMHead(VocabParallelEmbedding):
+    """Parallelized LM head.
+
+    Output logits weight matrices used in the Sampler. The weight and bias
+    tensors are padded to make sure they are divisible by the number of
+    model parallel GPUs.
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        bias: whether to use bias.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+    """
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 bias: bool = False,
+                 params_dtype: Optional[torch.dtype] = None,
+                 org_num_embeddings: Optional[int] = None,
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        super().__init__(num_embeddings, embedding_dim, params_dtype,
+                         org_num_embeddings, padding_size, quant_config,
+                         prefix)
+        self.quant_config = quant_config
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.num_embeddings_per_partition,
+                            dtype=params_dtype))
+            set_weight_attrs(self.bias, {
+                "output_dim": 0,
+                "weight_loader": self.weight_loader,
+            })
+        else:
+            self.register_parameter("bias", None)
+
+    def tie_weights(self, embed_tokens: VocabParallelEmbedding):
+        """Tie the weights with word embeddings."""
+        # GGUF quantized embed_tokens.
+        if self.quant_config and self.quant_config.get_name() == "gguf":
+            return embed_tokens
+        else:
+            self.weight = embed_tokens.weight
+            return self
+
+    def forward(self, input_):
+        del input_
+        raise RuntimeError("LMHead's weights should be used in the sampler.")
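
The class docstring above walks through a concrete layout: 1010 base tokens plus 16 added (LoRA) tokens, a padding multiple of 64, and TP=2. As a sanity check, the short standalone sketch below re-derives those numbers with the same arithmetic as pad_vocab_size and _get_indices. It is illustrative only (plain Python, no torch or vLLM required); the local helper vocab_range simply mirrors vocab_range_from_global_vocab_size from the file above.

# Standalone sketch: re-derive the vocab padding/sharding layout from the
# docstring example (org vocab 1010, 16 added tokens, pad to 64, TP=2).
PAD = 64


def pad_vocab_size(vocab_size: int, pad_to: int = PAD) -> int:
    # Same arithmetic as pad_vocab_size() above: round up to a multiple of pad_to.
    return ((vocab_size + pad_to - 1) // pad_to) * pad_to


def vocab_range(global_size: int, rank: int, world_size: int, offset: int = 0):
    # Mirrors vocab_range_from_global_vocab_size(): equal contiguous shards.
    per_partition = global_size // world_size
    start = rank * per_partition + offset
    return start, start + per_partition


org_vocab, added_vocab, tp_size = 1010, 16, 2
org_padded = pad_vocab_size(org_vocab)                   # 1024
total_padded = pad_vocab_size(org_padded + added_vocab)  # 1088
rows_per_rank = total_padded // tp_size                  # 544

for rank in range(tp_size):
    base_start, base_end = vocab_range(org_padded, rank, tp_size)
    add_start, add_end = vocab_range(total_padded - org_padded, rank, tp_size,
                                     offset=org_vocab)
    # Clamp away the padding, as _get_indices() does with min(..., vocab_size).
    n_base = min(base_end, org_vocab) - min(base_start, org_vocab)
    n_added = (min(add_end, org_vocab + added_vocab)
               - min(add_start, org_vocab + added_vocab))
    print(f"rank {rank}: rows={rows_per_rank}, base tokens={n_base}, added tokens={n_added}")

# Prints:
# rank 0: rows=544, base tokens=512, added tokens=16
# rank 1: rows=544, base tokens=498, added tokens=0

Rank 0 ends up with 544 rows, of which 512 are real base tokens and 16 are real added tokens; rank 1 holds the remaining 498 base tokens and only padding, matching the TP2 tables in the docstring.
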
sglang/srt/managers/data_parallel_controller.py
@@ -24,12 +24,12 @@ import zmq
 from sglang.srt.managers.io_struct import (
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
-    TokenizedRewardReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     configure_logger,
+    get_zmq_socket,
     kill_parent_process,
     suppress_other_loggers,
 )
@@ -66,8 +66,9 @@ class DataParallelController:

         # Init inter-process communication
         self.context = zmq.Context(1 + server_args.dp_size)
-        self.recv_from_tokenizer = self.context.socket(zmq.PULL)
-        self.recv_from_tokenizer.bind(f"ipc://{port_args.scheduler_input_ipc_name}")
+        self.recv_from_tokenizer = get_zmq_socket(
+            self.context, zmq.PULL, port_args.scheduler_input_ipc_name
+        )

         # Dispatch method
         self.round_robin_counter = 0
@@ -120,8 +121,9 @@ class DataParallelController:
             scheduler_procs.append(proc)
             scheduler_pipe_readers.append(reader)

-        send_to = self.context.socket(zmq.PUSH)
-        send_to.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
+        send_to = get_zmq_socket(
+            self.context, zmq.PUSH, port_args.scheduler_input_ipc_name
+        )

         # Wait for model to finish loading
         for i in range(len(scheduler_pipe_readers)):
@@ -149,14 +151,13 @@
                    (
                        TokenizedGenerateReqInput,
                        TokenizedEmbeddingReqInput,
-                        TokenizedRewardReqInput,
                    ),
                ):
                    self.dispatching(recv_req)
                else:
                    # Send other control messages to all workers
                    for worker in self.workers:
-                        worker.queue.put(recv_req)
+                        worker.send_pyobj(recv_req)


 def run_data_parallel_controller_process(
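
Several hunks in this release replace hand-rolled ZeroMQ socket setup with a shared get_zmq_socket helper imported from sglang.srt.utils; the helper's body is not part of these hunks. Judging only from the call sites shown here (PULL sockets were previously bound to an ipc:// endpoint, PUSH sockets connected to one), a minimal sketch of such a helper could look like the following. This is a hypothetical reconstruction; the actual implementation in sglang.srt.utils may differ, for example by also tuning socket buffer sizes.

# Hypothetical sketch of a get_zmq_socket-style helper, inferred only from the
# call sites in this diff. Not the actual sglang.srt.utils implementation.
import zmq


def get_zmq_socket(context: zmq.Context, socket_type: int, endpoint: str):
    socket = context.socket(socket_type)
    if socket_type == zmq.PULL:
        # Receivers own the endpoint, so they bind (as the old inline code did).
        socket.bind(f"ipc://{endpoint}")
    elif socket_type == zmq.PUSH:
        # Senders attach to an existing endpoint, so they connect.
        socket.connect(f"ipc://{endpoint}")
    else:
        raise ValueError(f"Unsupported socket type: {socket_type}")
    return socket
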
sglang/srt/managers/detokenizer_manager.py
@@ -27,11 +27,12 @@ from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
     BatchStrOut,
     BatchTokenIDOut,
+    GetMemPoolSizeReqOutput,
     UpdateWeightReqOutput,
 )
 from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import configure_logger, kill_parent_process
+from sglang.srt.utils import configure_logger, get_zmq_socket, kill_parent_process
 from sglang.utils import find_printable_text, get_exception_traceback

 logger = logging.getLogger(__name__)
@@ -58,11 +59,12 @@ class DetokenizerManager:
     ):
         # Init inter-process communication
         context = zmq.Context(2)
-        self.recv_from_scheduler = context.socket(zmq.PULL)
-        self.recv_from_scheduler.bind(f"ipc://{port_args.detokenizer_ipc_name}")
-
-        self.send_to_tokenizer = context.socket(zmq.PUSH)
-        self.send_to_tokenizer.connect(f"ipc://{port_args.tokenizer_ipc_name}")
+        self.recv_from_scheduler = get_zmq_socket(
+            context, zmq.PULL, port_args.detokenizer_ipc_name
+        )
+        self.send_to_tokenizer = get_zmq_socket(
+            context, zmq.PUSH, port_args.tokenizer_ipc_name
+        )

         if server_args.skip_tokenizer_init:
             self.tokenizer = None
@@ -111,12 +113,12 @@
                # If it is a weight update request, no detokenization is needed.
                self.send_to_tokenizer.send_pyobj(recv_obj)
                continue
-            elif self.tokenizer is None:
-                # If the tokenizer is skipped, no detokenization is needed
+            elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
                self.send_to_tokenizer.send_pyobj(recv_obj)
                continue
+            else:
+                assert isinstance(recv_obj, BatchTokenIDOut)

-            assert isinstance(recv_obj, BatchTokenIDOut)
            bs = len(recv_obj.rids)
            # Initialize decode status

sglang/srt/managers/image_processor.py
@@ -36,11 +36,12 @@ class BaseImageProcessor(ABC):
     def __init__(self, hf_config, server_args, _processor):
         self.hf_config = hf_config
         self._processor = _processor
+
         self.executor = concurrent.futures.ProcessPoolExecutor(
             initializer=init_global_processor,
             mp_context=mp.get_context("fork"),
             initargs=(server_args,),
-            max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
+            max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
         )

     @abstractmethod
@@ -179,7 +180,7 @@ class LlavaImageProcessor(BaseImageProcessor):
             "pixel_values": pixel_values,
             "image_hashes": image_hashes,
             "image_sizes": image_sizes,
-            "modalities": request_obj.modalities,
+            "modalities": request_obj.modalities or ["image"],
         }


@@ -239,7 +240,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             initializer=init_global_processor,
             mp_context=mp.get_context("fork"),
             initargs=(server_args,),
-            max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
+            max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
         )

     @staticmethod
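
The image-processor hunks above wrap the SGLANG_CPU_COUNT lookup in int(). The reason is that os.environ.get returns a string whenever the variable is set, while concurrent.futures.ProcessPoolExecutor expects an integer max_workers, so passing the raw value fails with a TypeError during its max_workers validation. A minimal illustration of the fixed pattern, using a hypothetical value of "8":

# Why the int() wrapper matters: environment variables are strings, and
# ProcessPoolExecutor requires an integer max_workers.
import concurrent.futures
import os

os.environ["SGLANG_CPU_COUNT"] = "8"                        # env vars are always strings

raw = os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())    # "8" (str) when the var is set
max_workers = int(raw)                                      # 8 (int)

# The old code passed `raw` directly, which raises a TypeError inside
# ProcessPoolExecutor when the environment variable is set.
executor = concurrent.futures.ProcessPoolExecutor(max_workers=max_workers)
executor.shutdown()
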