sglang 0.3.4.post2__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_latency.py +3 -3
- sglang/bench_server_latency.py +2 -3
- sglang/bench_serving.py +92 -0
- sglang/global_config.py +9 -3
- sglang/lang/chat_template.py +50 -25
- sglang/lang/interpreter.py +9 -1
- sglang/lang/ir.py +11 -2
- sglang/launch_server.py +1 -1
- sglang/srt/configs/model_config.py +51 -13
- sglang/srt/constrained/__init__.py +18 -0
- sglang/srt/constrained/bnf_cache.py +61 -0
- sglang/srt/constrained/grammar.py +190 -0
- sglang/srt/hf_transformers_utils.py +6 -5
- sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
- sglang/srt/layers/fused_moe/fused_moe.py +4 -3
- sglang/srt/layers/fused_moe/layer.py +28 -0
- sglang/srt/layers/quantization/base_config.py +16 -1
- sglang/srt/layers/vocab_parallel_embedding.py +486 -0
- sglang/srt/managers/data_parallel_controller.py +7 -6
- sglang/srt/managers/detokenizer_manager.py +9 -11
- sglang/srt/managers/image_processor.py +4 -3
- sglang/srt/managers/io_struct.py +70 -78
- sglang/srt/managers/schedule_batch.py +33 -49
- sglang/srt/managers/schedule_policy.py +24 -13
- sglang/srt/managers/scheduler.py +137 -80
- sglang/srt/managers/tokenizer_manager.py +224 -336
- sglang/srt/managers/tp_worker.py +5 -5
- sglang/srt/mem_cache/flush_cache.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +7 -4
- sglang/srt/model_executor/model_runner.py +8 -17
- sglang/srt/models/baichuan.py +4 -4
- sglang/srt/models/chatglm.py +4 -4
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +5 -5
- sglang/srt/models/deepseek.py +4 -4
- sglang/srt/models/deepseek_v2.py +4 -4
- sglang/srt/models/exaone.py +4 -4
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt2.py +287 -0
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +4 -4
- sglang/srt/models/internlm2.py +4 -4
- sglang/srt/models/llama.py +15 -7
- sglang/srt/models/llama_embedding.py +2 -10
- sglang/srt/models/llama_reward.py +5 -0
- sglang/srt/models/minicpm.py +4 -4
- sglang/srt/models/minicpm3.py +4 -4
- sglang/srt/models/mixtral.py +7 -5
- sglang/srt/models/mixtral_quant.py +4 -4
- sglang/srt/models/mllama.py +5 -5
- sglang/srt/models/olmo.py +4 -4
- sglang/srt/models/olmoe.py +4 -4
- sglang/srt/models/qwen.py +4 -4
- sglang/srt/models/qwen2.py +4 -4
- sglang/srt/models/qwen2_moe.py +4 -4
- sglang/srt/models/qwen2_vl.py +4 -8
- sglang/srt/models/stablelm.py +4 -4
- sglang/srt/models/torch_native_llama.py +4 -4
- sglang/srt/models/xverse.py +4 -4
- sglang/srt/models/xverse_moe.py +4 -4
- sglang/srt/openai_api/adapter.py +52 -66
- sglang/srt/sampling/sampling_batch_info.py +7 -13
- sglang/srt/server.py +31 -35
- sglang/srt/server_args.py +34 -5
- sglang/srt/utils.py +40 -56
- sglang/test/runners.py +2 -1
- sglang/test/test_utils.py +73 -25
- sglang/utils.py +62 -1
- sglang/version.py +1 -1
- sglang-0.3.5.dist-info/METADATA +344 -0
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/RECORD +77 -73
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
- sglang-0.3.4.post2.dist-info/METADATA +0 -899
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/layers/vocab_parallel_embedding.py (new file)

@@ -0,0 +1,486 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/layers/vocab_parallel_embedding.py
+
+from dataclasses import dataclass
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter, UninitializedParameter
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.model_executor.parameter import BasevLLMParameter
+
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+    method_has_implemented_embedding,
+)
+from sglang.srt.utils import set_weight_attrs
+
+DEFAULT_VOCAB_PADDING_SIZE = 64
+
+
+class UnquantizedEmbeddingMethod(QuantizeMethodBase):
+    """Unquantized method for embeddings."""
+
+    def create_weights(self, layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: List[int], input_size: int,
+                       output_size: int, params_dtype: torch.dtype,
+                       **extra_weight_attrs):
+        """Create weights for embedding layer."""
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=params_dtype),
+                           requires_grad=False)
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, extra_weight_attrs)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return F.linear(x, layer.weight, bias)
+
+    def embedding(self, layer: torch.nn.Module,
+                  input_: torch.Tensor) -> torch.Tensor:
+        return F.embedding(input_, layer.weight)
+
+
+def pad_vocab_size(vocab_size: int,
+                   pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
+    """Pad the vocab size to the given value."""
+    return ((vocab_size + pad_to - 1) // pad_to) * pad_to
+
+
+def vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size: int,
+        rank: int,
+        offset: int = 0) -> Sequence[int]:
+    index_f = rank * per_partition_vocab_size
+    index_l = index_f + per_partition_vocab_size
+    return index_f + offset, index_l + offset
+
+
+def vocab_range_from_global_vocab_size(global_vocab_size: int,
+                                       rank: int,
+                                       world_size: int,
+                                       offset: int = 0) -> Sequence[int]:
+    per_partition_vocab_size = divide(global_vocab_size, world_size)
+    return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
+                                                     rank,
+                                                     offset=offset)
+
+
+@dataclass
+class VocabParallelEmbeddingShardIndices:
+    """Indices for a shard of a vocab parallel embedding."""
+    padded_org_vocab_start_index: int
+    padded_org_vocab_end_index: int
+    padded_added_vocab_start_index: int
+    padded_added_vocab_end_index: int
+
+    org_vocab_start_index: int
+    org_vocab_end_index: int
+    added_vocab_start_index: int
+    added_vocab_end_index: int
+
+    @property
+    def num_org_elements(self) -> int:
+        return self.org_vocab_end_index - self.org_vocab_start_index
+
+    @property
+    def num_added_elements(self) -> int:
+        return self.added_vocab_end_index - self.added_vocab_start_index
+
+    @property
+    def num_org_elements_padded(self) -> int:
+        return (self.padded_org_vocab_end_index -
+                self.padded_org_vocab_start_index)
+
+    @property
+    def num_added_elements_padded(self) -> int:
+        return (self.padded_added_vocab_end_index -
+                self.padded_added_vocab_start_index)
+
+    @property
+    def num_org_vocab_padding(self) -> int:
+        return self.num_org_elements_padded - self.num_org_elements
+
+    @property
+    def num_added_vocab_padding(self) -> int:
+        return self.num_added_elements_padded - self.num_added_elements
+
+    @property
+    def num_elements_padded(self) -> int:
+        return self.num_org_elements_padded + self.num_added_elements_padded
+
+    def __post_init__(self):
+        # sanity checks
+        assert (self.padded_org_vocab_start_index <=
+                self.padded_org_vocab_end_index)
+        assert (self.padded_added_vocab_start_index <=
+                self.padded_added_vocab_end_index)
+
+        assert self.org_vocab_start_index <= self.org_vocab_end_index
+        assert self.added_vocab_start_index <= self.added_vocab_end_index
+
+        assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
+        assert (self.added_vocab_start_index <=
+                self.padded_added_vocab_start_index)
+        assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
+        assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
+
+        assert self.num_org_elements <= self.num_org_elements_padded
+        assert self.num_added_elements <= self.num_added_elements_padded
+
+
+@torch.jit.script
+def get_masked_input_and_mask(
+        input_: torch.Tensor, org_vocab_start_index: int,
+        org_vocab_end_index: int, num_org_vocab_padding: int,
+        added_vocab_start_index: int,
+        added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    # torch.jit.script will fuse all of the pointwise ops below
+    # into a single kernel, making it very fast
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
+                                                          org_vocab_end_index)
+    added_vocab_mask = (input_ >= added_vocab_start_index) & (
+        input_ < added_vocab_end_index)
+    added_offset = added_vocab_start_index - (
+        org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding
+    valid_offset = (org_vocab_start_index *
+                    org_vocab_mask) + (added_offset * added_vocab_mask)
+    vocab_mask = org_vocab_mask | added_vocab_mask
+    input_ = vocab_mask * (input_ - valid_offset)
+    return input_, ~vocab_mask
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    Adapted from torch.nn.Embedding, note that we pad the vocabulary size to
+    make sure it is divisible by the number of model parallel GPUs.
+
+    In order to support various loading methods, we ensure that LoRA-added
+    embeddings are always at the end of TP-sharded tensors. In other words,
+    we shard base embeddings and LoRA embeddings separately (both padded),
+    and place them in the same tensor.
+    In this example, we will have the original vocab size = 1010,
+    added vocab size = 16 and padding to 64. Therefore, the total
+    vocab size with padding will be 1088 (because we first pad 1010 to
+    1024, add 16, and then pad to 1088).
+    Therefore, the tensor format looks like the following:
+    TP1, rank 0 (no sharding):
+                            |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >|
+    corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1015 | -1 | ... | -1 |
+                     index: | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 |
+
+    TP2, rank 0:
+                            |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >|
+    corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1000 | ... | 1015 | -1 | ... | -1 |
+                     index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 520 | ... | 543 |
+    TP2, rank 1:
+                            |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
+    corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 |
+                     index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 |
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+        quant_config: quant config for the layer
+        prefix: full name of the layer in the state dict
+    """ # noqa: E501
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 params_dtype: Optional[torch.dtype] = None,
+                 org_num_embeddings: Optional[int] = None,
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "",
+                 enable_tp: bool = True):
+        super().__init__()
+
+        self.enable_tp = enable_tp
+        if self.enable_tp:
+            tp_rank = get_tensor_model_parallel_rank()
+            self.tp_size = get_tensor_model_parallel_world_size()
+        else:
+            tp_rank = 0
+            self.tp_size = 1
+
+        self.num_embeddings = num_embeddings
+        self.padding_size = padding_size
+        self.org_vocab_size = org_num_embeddings or num_embeddings
+        num_added_embeddings = num_embeddings - self.org_vocab_size
+        self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size,
+                                                    self.padding_size)
+        self.num_embeddings_padded = pad_vocab_size(
+            self.org_vocab_size_padded + num_added_embeddings,
+            self.padding_size)
+        assert self.org_vocab_size_padded <= self.num_embeddings_padded
+
+        self.shard_indices = self._get_indices(self.num_embeddings_padded,
+                                               self.org_vocab_size_padded,
+                                               self.num_embeddings,
+                                               self.org_vocab_size, tp_rank,
+                                               self.tp_size)
+        self.embedding_dim = embedding_dim
+
+        linear_method = None
+        if quant_config is not None:
+            linear_method = quant_config.get_quant_method(self, prefix=prefix)
+        if linear_method is None:
+            linear_method = UnquantizedEmbeddingMethod()
+
+        # If we are making an embedding layer, then our quantization linear
+        # method must implement the embedding operation. If we are another
+        # layer type like ParallelLMHead, this is not important.
+        is_embedding_layer = type(self.__class__) is VocabParallelEmbedding
+        linear_method_implements_embedding = method_has_implemented_embedding(
+            type(linear_method))
+        if is_embedding_layer and not linear_method_implements_embedding:
+            raise NotImplementedError(
+                f"The class {type(linear_method).__name__} must implement "
+                "the 'embedding' method, see UnquantizedEmbeddingMethod.")
+
+        self.linear_method: QuantizeMethodBase = linear_method
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        # Divide the weight matrix along the vocaburaly dimension.
+        self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
+        self.num_embeddings_per_partition = divide(self.num_embeddings_padded,
+                                                   self.tp_size)
+        assert (self.shard_indices.num_elements_padded ==
+                self.num_embeddings_per_partition)
+        self.num_org_embeddings_per_partition = (
+            self.shard_indices.org_vocab_end_index -
+            self.shard_indices.org_vocab_start_index)
+        self.num_added_embeddings_per_partition = (
+            self.shard_indices.added_vocab_end_index -
+            self.shard_indices.added_vocab_start_index)
+
+        self.linear_method.create_weights(self,
+                                          self.embedding_dim,
+                                          [self.num_embeddings_per_partition],
+                                          self.embedding_dim,
+                                          self.num_embeddings_padded,
+                                          params_dtype=params_dtype,
+                                          weight_loader=self.weight_loader)
+
+    @classmethod
+    def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int,
+                     vocab_size: int, org_vocab_size: int, tp_rank: int,
+                     tp_size: int) -> VocabParallelEmbeddingShardIndices:
+        """Get start and end indices for vocab parallel embedding, following the
+        layout outlined in the class docstring, based on the given tp_rank and
+        tp_size."""
+        num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded
+        padded_org_vocab_start_index, padded_org_vocab_end_index = (
+            vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank,
+                                               tp_size))
+        padded_added_vocab_start_index, padded_added_vocab_end_index = (
+            vocab_range_from_global_vocab_size(num_added_embeddings_padded,
+                                               tp_rank,
+                                               tp_size,
+                                               offset=org_vocab_size))
+        # remove padding
+        org_vocab_start_index = min(padded_org_vocab_start_index,
+                                    org_vocab_size)
+        org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size)
+        added_vocab_start_index = min(padded_added_vocab_start_index,
+                                      vocab_size)
+        added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size)
+        return VocabParallelEmbeddingShardIndices(
+            padded_org_vocab_start_index, padded_org_vocab_end_index,
+            padded_added_vocab_start_index, padded_added_vocab_end_index,
+            org_vocab_start_index, org_vocab_end_index,
+            added_vocab_start_index, added_vocab_end_index)
+
+    def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
+        """Get a mapping that can be used to reindex the gathered
+        logits for sampling.
+
+        During sampling, we gather logits from all ranks. The relationship
+        of index->token_id will follow the same format as outlined in the class
+        docstring. However, after the gather, we want to reindex the final
+        logits tensor to map index->token_id one-to-one (the index is always
+        equal the token_id it corresponds to). The indices returned by this
+        method allow us to do that.
+        """
+        if self.tp_size < 2:
+            return None
+
+        base_embeddings: List[int] = []
+        added_embeddings: List[int] = []
+        padding: List[int] = []
+        for tp_rank in range(self.tp_size):
+            shard_indices = self._get_indices(self.num_embeddings_padded,
+                                              self.org_vocab_size_padded,
+                                              self.num_embeddings,
+                                              self.org_vocab_size, tp_rank,
+                                              self.tp_size)
+            range_start = self.num_embeddings_per_partition * tp_rank
+            range_end = self.num_embeddings_per_partition * (tp_rank + 1)
+            base_embeddings.extend(
+                range(range_start,
+                      range_start + shard_indices.num_org_elements))
+            padding.extend(
+                range(range_start + shard_indices.num_org_elements,
+                      range_start + shard_indices.num_org_elements_padded))
+            added_embeddings.extend(
+                range(
+                    range_start + shard_indices.num_org_elements_padded,
+                    range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements))
+            padding.extend(
+                range(
+                    range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements,
+                    range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements_padded))
+            assert (range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements_padded == range_end)
+        ret = base_embeddings + added_embeddings + padding
+        assert len(ret) == self.num_embeddings_padded
+        return ret
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        output_dim = getattr(param, "output_dim", None)
+        packed_dim = getattr(param, "packed_dim", None)
+
+        # If the parameter is a gguf weight, then load it directly.
+        if getattr(param, "is_gguf_weight_type", None):
+            param.data.copy_(loaded_weight)
+            param.weight_type = loaded_weight.item()
+            return
+        elif isinstance(param, UninitializedParameter):
+            shape = list(loaded_weight.shape)
+            if output_dim is not None:
+                shape[output_dim] = shape[output_dim] // self.tp_size
+            param.materialize(tuple(shape), dtype=loaded_weight.dtype)
+
+        # If parameter does not have output dim, then it should
+        # be copied onto all gpus (e.g. g_idx for act_order gptq).
+        if output_dim is None:
+            assert param.data.shape == loaded_weight.shape
+            param.data.copy_(loaded_weight)
+            return
+
+        # Shard indexes for loading the weight
+        start_idx = self.shard_indices.org_vocab_start_index
+        shard_size = self.shard_indices.org_vocab_end_index - start_idx
+
+        # If param packed on the same dim we are sharding on, then
+        # need to adjust offsets of loaded weight by pack_factor.
+        if packed_dim is not None and packed_dim == output_dim:
+            packed_factor = param.packed_factor if isinstance(
+                param, BasevLLMParameter) else param.pack_factor
+            assert loaded_weight.shape[output_dim] == (self.org_vocab_size //
+                                                       param.packed_factor)
+            start_idx = start_idx // packed_factor
+            shard_size = shard_size // packed_factor
+        else:
+            assert loaded_weight.shape[output_dim] == self.org_vocab_size
+
+        # Copy the data.
+        loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+        param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
+        param[loaded_weight.shape[0]:].data.fill_(0)
+
+    def forward(self, input_):
+        if self.tp_size > 1:
+            # Build the mask.
+            masked_input, input_mask = get_masked_input_and_mask(
+                input_, self.shard_indices.org_vocab_start_index,
+                self.shard_indices.org_vocab_end_index,
+                self.shard_indices.num_org_vocab_padding,
+                self.shard_indices.added_vocab_start_index,
+                self.shard_indices.added_vocab_end_index)
+        else:
+            masked_input = input_
+        # Get the embeddings.
+        output_parallel = self.linear_method.embedding(self,
+                                                       masked_input.long())
+        # Mask the output embedding.
+        if self.tp_size > 1:
+            output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
+            # Reduce across all the model parallel GPUs.
+            output = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output = output_parallel
+        return output
+
+    def extra_repr(self) -> str:
+        s = f"num_embeddings={self.num_embeddings_per_partition}"
+        s += f", embedding_dim={self.embedding_dim}"
+        s += f", org_vocab_size={self.org_vocab_size}"
+        s += f', num_embeddings_padded={self.num_embeddings_padded}'
+        if self.enable_tp:
+            s += f', tp_size={self.tp_size}'
+        return s
+
+
+class ParallelLMHead(VocabParallelEmbedding):
+    """Parallelized LM head.
+
+    Output logits weight matrices used in the Sampler. The weight and bias
+    tensors are padded to make sure they are divisible by the number of
+    model parallel GPUs.
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        bias: whether to use bias.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+    """
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 bias: bool = False,
+                 params_dtype: Optional[torch.dtype] = None,
+                 org_num_embeddings: Optional[int] = None,
+                 padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        super().__init__(num_embeddings, embedding_dim, params_dtype,
+                         org_num_embeddings, padding_size, quant_config,
+                         prefix)
+        self.quant_config = quant_config
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.num_embeddings_per_partition,
+                            dtype=params_dtype))
+            set_weight_attrs(self.bias, {
+                "output_dim": 0,
+                "weight_loader": self.weight_loader,
+            })
+        else:
+            self.register_parameter("bias", None)
+
+    def tie_weights(self, embed_tokens: VocabParallelEmbedding):
+        """Tie the weights with word embeddings."""
+        # GGUF quantized embed_tokens.
+        if self.quant_config and self.quant_config.get_name() == "gguf":
+            return embed_tokens
+        else:
+            self.weight = embed_tokens.weight
+            return self
+
+    def forward(self, input_):
+        del input_
+        raise RuntimeError("LMHead's weights should be used in the sampler.")
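The VocabParallelEmbedding docstring above describes how the vocabulary is padded and then sharded across tensor-parallel ranks. The following is a standalone sketch (not code from the package) that reproduces the arithmetic of the documented example, 1010 base tokens plus 16 added (LoRA) tokens, padded to multiples of 64, with TP=2:

    # Standalone sketch of the padding arithmetic from the docstring above.
    # Assumes the documented example: 1010 base tokens, 16 added tokens,
    # padding to multiples of 64, tensor parallel size 2.

    def pad_vocab_size(vocab_size: int, pad_to: int = 64) -> int:
        # Round up to the next multiple of pad_to.
        return ((vocab_size + pad_to - 1) // pad_to) * pad_to

    org_vocab_size, added_vocab_size, tp_size = 1010, 16, 2

    org_padded = pad_vocab_size(org_vocab_size)                   # 1024
    total_padded = pad_vocab_size(org_padded + added_vocab_size)  # 1088
    per_rank = total_padded // tp_size                            # 544 rows per rank

    # Each rank holds org_padded // tp_size = 512 base rows; the added rows and
    # any remaining padding are appended after the base shard on each rank.
    print(org_padded, total_padded, per_rank)  # 1024 1088 544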
sglang/srt/managers/data_parallel_controller.py

@@ -24,12 +24,12 @@ import zmq
 from sglang.srt.managers.io_struct import (
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
-    TokenizedRewardReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     configure_logger,
+    get_zmq_socket,
     kill_parent_process,
     suppress_other_loggers,
 )

@@ -66,8 +66,9 @@ class DataParallelController:

         # Init inter-process communication
         self.context = zmq.Context(1 + server_args.dp_size)
-        self.recv_from_tokenizer =
-
+        self.recv_from_tokenizer = get_zmq_socket(
+            self.context, zmq.PULL, port_args.scheduler_input_ipc_name
+        )

         # Dispatch method
         self.round_robin_counter = 0

@@ -120,8 +121,9 @@ class DataParallelController:
            scheduler_procs.append(proc)
            scheduler_pipe_readers.append(reader)

-        send_to =
-
+        send_to = get_zmq_socket(
+            self.context, zmq.PUSH, port_args.scheduler_input_ipc_name
+        )

        # Wait for model to finish loading
        for i in range(len(scheduler_pipe_readers)):

@@ -149,7 +151,6 @@ class DataParallelController:
                (
                    TokenizedGenerateReqInput,
                    TokenizedEmbeddingReqInput,
-                   TokenizedRewardReqInput,
                ),
            ):
                self.dispatching(recv_req)
sglang/srt/managers/detokenizer_manager.py

@@ -32,7 +32,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import configure_logger, kill_parent_process
+from sglang.srt.utils import configure_logger, get_zmq_socket, kill_parent_process
 from sglang.utils import find_printable_text, get_exception_traceback

 logger = logging.getLogger(__name__)

@@ -59,11 +59,12 @@ class DetokenizerManager:
     ):
         # Init inter-process communication
         context = zmq.Context(2)
-        self.recv_from_scheduler =
-
-
-        self.send_to_tokenizer =
-
+        self.recv_from_scheduler = get_zmq_socket(
+            context, zmq.PULL, port_args.detokenizer_ipc_name
+        )
+        self.send_to_tokenizer = get_zmq_socket(
+            context, zmq.PUSH, port_args.tokenizer_ipc_name
+        )

         if server_args.skip_tokenizer_init:
             self.tokenizer = None

@@ -115,12 +116,9 @@ class DetokenizerManager:
             elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
                 self.send_to_tokenizer.send_pyobj(recv_obj)
                 continue
-
-
-                self.send_to_tokenizer.send_pyobj(recv_obj)
-                continue
+            else:
+                assert isinstance(recv_obj, BatchTokenIDOut)

-            assert isinstance(recv_obj, BatchTokenIDOut)
             bs = len(recv_obj.rids)

             # Initialize decode status
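In both managers above, the inline socket setup is replaced by calls to a shared get_zmq_socket helper from sglang.srt.utils. As a rough, hypothetical sketch of what such a helper could look like with pyzmq (an illustration only, not the actual sglang implementation, which may set additional socket options):

    # Hypothetical sketch of a get_zmq_socket-style helper.
    import zmq

    def get_zmq_socket(context: zmq.Context, socket_type: int, endpoint: str) -> zmq.Socket:
        socket = context.socket(socket_type)
        if socket_type == zmq.PULL:
            # Receiving ends bind to the IPC endpoint.
            socket.bind(f"ipc://{endpoint}")
        else:
            # Sending ends (e.g. zmq.PUSH) connect to it.
            socket.connect(f"ipc://{endpoint}")
        return socket

    # Usage mirroring the diffs above:
    # self.recv_from_scheduler = get_zmq_socket(context, zmq.PULL, port_args.detokenizer_ipc_name)
    # self.send_to_tokenizer = get_zmq_socket(context, zmq.PUSH, port_args.tokenizer_ipc_name)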
sglang/srt/managers/image_processor.py

@@ -36,11 +36,12 @@ class BaseImageProcessor(ABC):
     def __init__(self, hf_config, server_args, _processor):
         self.hf_config = hf_config
         self._processor = _processor
+
         self.executor = concurrent.futures.ProcessPoolExecutor(
             initializer=init_global_processor,
             mp_context=mp.get_context("fork"),
             initargs=(server_args,),
-            max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
+            max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
         )

     @abstractmethod

@@ -179,7 +180,7 @@ class LlavaImageProcessor(BaseImageProcessor):
             "pixel_values": pixel_values,
             "image_hashes": image_hashes,
             "image_sizes": image_sizes,
-            "modalities": request_obj.modalities,
+            "modalities": request_obj.modalities or ["image"],
         }


@@ -239,7 +240,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             initializer=init_global_processor,
             mp_context=mp.get_context("fork"),
             initargs=(server_args,),
-            max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
+            max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
         )

     @staticmethod
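The max_workers change in both image processors fixes a type mismatch: os.environ.get returns a string whenever the variable is set, while ProcessPoolExecutor expects an integer. A minimal illustration of why the int() cast is needed:

    import os
    from concurrent.futures import ProcessPoolExecutor

    os.environ["SGLANG_CPU_COUNT"] = "8"   # environment values are always strings

    workers = os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())
    print(type(workers))                   # <class 'str'> when the variable is set

    # ProcessPoolExecutor(max_workers="8") raises TypeError, so the value is cast:
    executor = ProcessPoolExecutor(max_workers=int(workers))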