sglang 0.3.5__py3-none-any.whl → 0.3.5.post2__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the registry.
- sglang/bench_offline_throughput.py +309 -0
- sglang/bench_serving.py +148 -24
- sglang/srt/configs/model_config.py +5 -2
- sglang/srt/constrained/__init__.py +2 -66
- sglang/srt/constrained/base_grammar_backend.py +73 -0
- sglang/srt/constrained/outlines_backend.py +165 -0
- sglang/srt/constrained/outlines_jump_forward.py +182 -0
- sglang/srt/constrained/xgrammar_backend.py +150 -0
- sglang/srt/layers/attention/triton_ops/decode_attention.py +7 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
- sglang/srt/layers/fused_moe/fused_moe.py +23 -7
- sglang/srt/layers/fused_moe/patch.py +4 -2
- sglang/srt/layers/quantization/base_config.py +4 -6
- sglang/srt/layers/vocab_parallel_embedding.py +216 -150
- sglang/srt/managers/detokenizer_manager.py +0 -14
- sglang/srt/managers/io_struct.py +5 -3
- sglang/srt/managers/schedule_batch.py +14 -20
- sglang/srt/managers/scheduler.py +159 -96
- sglang/srt/managers/tokenizer_manager.py +81 -17
- sglang/srt/metrics/collector.py +211 -0
- sglang/srt/metrics/func_timer.py +108 -0
- sglang/srt/mm_utils.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- sglang/srt/model_executor/forward_batch_info.py +7 -3
- sglang/srt/model_executor/model_runner.py +6 -2
- sglang/srt/models/gemma2_reward.py +69 -0
- sglang/srt/models/gpt2.py +31 -37
- sglang/srt/models/internlm2_reward.py +62 -0
- sglang/srt/models/llama.py +11 -6
- sglang/srt/models/llama_reward.py +5 -26
- sglang/srt/models/qwen2_vl.py +5 -7
- sglang/srt/openai_api/adapter.py +11 -4
- sglang/srt/openai_api/protocol.py +29 -26
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/sampling/sampling_params.py +2 -16
- sglang/srt/server.py +60 -17
- sglang/srt/server_args.py +66 -25
- sglang/srt/utils.py +120 -0
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_mgsm.py +2 -2
- sglang/test/test_utils.py +21 -7
- sglang/utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.3.5.dist-info → sglang-0.3.5.post2.dist-info}/METADATA +12 -8
- {sglang-0.3.5.dist-info → sglang-0.3.5.post2.dist-info}/RECORD +49 -45
- {sglang-0.3.5.dist-info → sglang-0.3.5.post2.dist-info}/WHEEL +1 -1
- sglang/srt/constrained/base_tool_cache.py +0 -65
- sglang/srt/constrained/bnf_cache.py +0 -61
- sglang/srt/constrained/fsm_cache.py +0 -95
- sglang/srt/constrained/grammar.py +0 -190
- sglang/srt/constrained/jump_forward.py +0 -203
- {sglang-0.3.5.dist-info → sglang-0.3.5.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.5.dist-info → sglang-0.3.5.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/gemma2_reward.py
ADDED
@@ -0,0 +1,69 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Gemma2Config
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.gemma2 import Gemma2ForCausalLM, Gemma2Model
+
+
+class Gemma2ForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        config: Gemma2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config=None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.torchao_config = None
+        self.quant_config = quant_config
+        self.num_labels = config.num_labels
+        self.model = Gemma2Model(config, quant_config=quant_config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "Gemma2ForSequenceClassification is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.score(last_token_hidden)
+
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        Gemma2ForCausalLM.load_weights(self, weights)
+
+
+EntryClass = [Gemma2ForSequenceClassification]
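The new module reuses Gemma2Model as the backbone and replaces the LM head with a classification head: the pooler keeps only the last token's hidden state, and a bias-free linear layer maps it to num_labels scores. A minimal standalone sketch of that pooling-plus-scoring step (plain PyTorch with illustrative shapes, outside SGLang's batched ForwardBatch path):

    import torch
    from torch import nn

    hidden_size, num_labels = 16, 2
    score = nn.Linear(hidden_size, num_labels, bias=False)  # mirrors self.score

    hidden_states = torch.randn(5, hidden_size)  # hidden states for 5 tokens of one sequence
    last_token_hidden = hidden_states[-1]  # PoolingType.LAST keeps the final token only
    scores = score(last_token_hidden)
    print(scores.shape)  # torch.Size([2]): one score per label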
sglang/srt/models/gpt2.py
CHANGED
@@ -28,7 +28,7 @@ from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-#from sglang.srt.layers.activation import get_act_fn
+# from sglang.srt.layers.activation import get_act_fn
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
@@ -47,15 +47,14 @@ class GPT2Attention(nn.Module):
         self,
         layer_id: int,
         config: GPT2Config,
-        cache_config
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
         total_num_heads = config.num_attention_heads
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
         assert total_num_heads % tensor_model_parallel_world_size == 0
         self.num_heads = total_num_heads // tensor_model_parallel_world_size
         self.head_dim = self.hidden_size // total_num_heads
@@ -76,11 +75,13 @@ class GPT2Attention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.c_proj",
         )
-        self.attn = RadixAttention(
-
-
-
-
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            scaling=self.scale,
+            num_kv_heads=total_num_heads,
+            layer_id=layer_id,
+        )
 
     def forward(
         self,
@@ -119,10 +120,14 @@ class GPT2MLP(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.c_proj",
         )
-        self.act = get_act_fn(
-
+        self.act = get_act_fn(
+            config.activation_function, quant_config, intermediate_size
+        )
 
-    def forward(
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
         hidden_states, _ = self.c_fc(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states, _ = self.c_proj(hidden_states)
@@ -135,27 +140,20 @@ class GPT2Block(nn.Module):
         self,
         layer_id: int,
         config: GPT2Config,
-        cache_config
-
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
         super().__init__()
         hidden_size = config.hidden_size
-        inner_dim =
-            hidden_size)
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
 
         self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = GPT2Attention(
-
-
-            quant_config,
-            prefix=f"{prefix}.attn")
+        self.attn = GPT2Attention(
+            layer_id, config, cache_config, quant_config, prefix=f"{prefix}.attn"
+        )
         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.mlp = GPT2MLP(inner_dim,
-            config,
-            quant_config,
-            prefix=f"{prefix}.mlp")
+        self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")
 
     def forward(
         self,
@@ -179,13 +177,12 @@ class GPT2Block(nn.Module):
         return hidden_states
 
 
-
 class GPT2Model(nn.Module):
 
     def __init__(
         self,
         config: GPT2Config,
-        cache_config
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
@@ -229,16 +226,15 @@ class GPT2LMHeadModel(nn.Module):
     def __init__(
         self,
         config: GPT2Config,
-        cache_config
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
    ):
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.transformer = GPT2Model(
-
-
-            prefix="transformer")
+        self.transformer = GPT2Model(
+            config, cache_config, quant_config, prefix="transformer"
+        )
         self.lm_head = self.transformer.wte
 
         self.logits_processor = LogitsProcessor(config)
@@ -254,8 +250,6 @@ class GPT2LMHeadModel(nn.Module):
             input_ids, hidden_states, self.lm_head.weight, forward_batch
         )
 
-
-
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         for name, loaded_weight in weights:
@@ -280,8 +274,8 @@ class GPT2LMHeadModel(nn.Module):
                 if not name.endswith(".weight"):
                     continue
                 loaded_weight = loaded_weight.t()
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)
 
-
+
+EntryClass = GPT2LMHeadModel
sglang/srt/models/internlm2_reward.py
ADDED
@@ -0,0 +1,62 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.internlm2 import InternLM2ForCausalLM, InternLM2Model
+
+
+class InternLM2ForRewardModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config=None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.model = InternLM2Model(config, quant_config)
+        self.v_head = nn.Linear(config.hidden_size, 1, bias=False)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert get_embedding, "InternLM2ForRewardModel is only used for embedding"
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.v_head(last_token_hidden)
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        return InternLM2ForCausalLM.load_weights(self, weights)
+
+
+EntryClass = InternLM2ForRewardModel
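Unlike the Gemma2 classification head, InternLM2's v_head maps the pooled last-token state to a single scalar, so the "embedding" returned for each sequence is a one-element reward score. A sketch of that step (plain PyTorch, illustrative shapes):

    import torch
    from torch import nn

    hidden_size = 16
    v_head = nn.Linear(hidden_size, 1, bias=False)  # mirrors self.v_head

    pooled = torch.randn(3, hidden_size)  # last-token states for a batch of 3 sequences
    rewards = v_head(pooled).squeeze(-1)
    print(rewards.shape)  # torch.Size([3]): one scalar reward per sequence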
sglang/srt/models/llama.py
CHANGED
@@ -380,6 +380,12 @@ class LlamaForCausalLM(nn.Module):
         ]
         params_dict = dict(self.named_parameters())
 
+        load_tie_word_embeddings = (
+            hasattr(self.config, "tie_word_embeddings")
+            and self.config.tie_word_embeddings
+            and "lm_head.weight" in params_dict
+        )
+
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
                 continue
@@ -412,15 +418,14 @@ class LlamaForCausalLM(nn.Module):
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
 
-
-
-
-
-        ):
+            if load_tie_word_embeddings and name == "model.embed_tokens.weight":
+                embed_tokens_weight = loaded_weight
+
+        if load_tie_word_embeddings:
             # Tie output embedding layer to input embedding layer, to solve issues where lm_head.weight is missing
             param = self.lm_head.weight
             weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param,
+            weight_loader(param, embed_tokens_weight)
 
         apply_torchao_config_(self, params_dict, set(["proj.weight"]))
 
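The llama.py change makes tied embeddings explicit: when the config sets tie_word_embeddings and the checkpoint lacks lm_head.weight, the loader captures model.embed_tokens.weight during the loop and copies it into lm_head afterwards. A toy version of the same idea (a sketch, not the SGLang loader):

    import torch
    from torch import nn

    vocab_size, hidden = 10, 4
    embed_tokens = nn.Embedding(vocab_size, hidden)
    lm_head = nn.Linear(hidden, vocab_size, bias=False)

    # checkpoint had no lm_head.weight: tie it to the input embedding instead
    with torch.no_grad():
        lm_head.weight.copy_(embed_tokens.weight)
    assert torch.equal(lm_head.weight, embed_tokens.weight)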
sglang/srt/models/llama_reward.py
CHANGED
@@ -18,9 +18,7 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -59,22 +57,13 @@ class LlamaForSequenceClassification(nn.Module):
         ), "LlamaForSequenceClassification is only used for embedding"
 
         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
-
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.score(last_token_hidden)
 
-        return
+        return EmbeddingPoolerOutput(scores)
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-
-
-        for name, loaded_weight in weights:
-            if "classification_head" in name:
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            elif "lm_head" in name:
-                continue
-            else:
-                LlamaForCausalLM.load_weights(self, [(name, loaded_weight)])
+        return LlamaForCausalLM.load_weights(self, weights)
 
 
 class LlamaForSequenceClassificationWithNormal_Weights(LlamaForSequenceClassification):
@@ -127,17 +116,7 @@ class LlamaForSequenceClassificationWithNormal_Weights(LlamaForSequenceClassific
         return EmbeddingPoolerOutput(scores)
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-
-
-        for name, loaded_weight in weights:
-            if "classification_head" in name:
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            elif "lm_head" in name:
-                continue
-            else:
-                LlamaForCausalLM.load_weights(self, [(name, loaded_weight)])
+        return super().load_weights(weights)
 
 
 EntryClass = [
sglang/srt/models/qwen2_vl.py
CHANGED
@@ -57,27 +57,27 @@ logger = init_logger(__name__)
 
 class Qwen2VLImageInputs(TypedDict):
     pixel_values: torch.Tensor
-    """Shape:
+    """Shape:
     `(num_patches, num_channels * patch_size * patch_size)`
     """
 
     image_grid_thw: torch.Tensor
     """Shape: `(num_images, 3)`
-
+
     This should be in `(grid_t, grid_h, grid_w)` format.
     """
 
 
 class Qwen2VLVideoInputs(TypedDict):
     pixel_values_videos: torch.Tensor
-    """Shape:
-    `(num_patches,
+    """Shape:
+    `(num_patches,
     num_channels * temporal_patch_size * patch_size * patch_size)`
     """
 
     video_grid_thw: torch.Tensor
     """Shape: `(num_videos, 3)`
-
+
     This should be in `(grid_t, grid_h, grid_w)` format.
     """
 
@@ -649,8 +649,6 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         ]
         image_embeds_offset += num_image_tokens
 
-        input_ids = None
-
         hidden_states = self.model(
             input_ids=input_ids,
             positions=positions,
sglang/srt/openai_api/adapter.py
CHANGED
@@ -498,6 +498,10 @@ def v1_generate_request(
         )
 
         prompts.append(request.prompt)
+        if request.echo and request.logprobs:
+            current_logprob_start_len = 0
+        else:
+            current_logprob_start_len = -1
         sampling_params_list.append(
             {
                 "temperature": request.temperature,
@@ -512,12 +516,13 @@ def v1_generate_request(
                 "regex": request.regex,
                 "json_schema": request.json_schema,
                 "n": request.n,
-                "ignore_eos": request.ignore_eos,
                 "no_stop_trim": request.no_stop_trim,
+                "ignore_eos": request.ignore_eos,
+                "skip_special_tokens": request.skip_special_tokens,
             }
         )
         return_logprobs.append(request.logprobs is not None and request.logprobs > 0)
-        logprob_start_lens.append(
+        logprob_start_lens.append(current_logprob_start_len)
         top_logprobs_nums.append(
             request.logprobs if request.logprobs is not None else 0
         )
@@ -924,7 +929,9 @@ def v1_chat_generate_request(
             "repetition_penalty": request.repetition_penalty,
             "regex": request.regex,
             "n": request.n,
+            "no_stop_trim": request.no_stop_trim,
             "ignore_eos": request.ignore_eos,
+            "skip_special_tokens": request.skip_special_tokens,
         }
         if request.response_format and request.response_format.type == "json_schema":
             sampling_params["json_schema"] = convert_json_schema_to_str(
@@ -1162,7 +1169,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     is_first = False
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(role="assistant"),
+                        delta=DeltaMessage(role="assistant", content=""),
                         finish_reason=(
                             finish_reason["type"] if finish_reason else ""
                         ),
@@ -1277,7 +1284,7 @@ def v1_embedding_request(all_requests, tokenizer_manager):
             else:
                 prompt_kwargs = {"input_ids": prompt}
     else:
-        if isinstance(prompts[0], str) or isinstance(
+        if isinstance(prompts[0], str) or isinstance(propmts[0][0], str):
             prompt_kwargs = {"text": prompts}
         else:
             prompt_kwargs = {"input_ids": prompts}
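One user-visible effect of the adapter change: when echo is enabled and logprobs are requested, logprob_start_len is now 0, so the response carries logprobs for the prompt tokens as well as the completion. A hypothetical client call against an SGLang server's OpenAI-compatible endpoint (the base_url and model name are placeholders):

    import openai

    client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
    resp = client.completions.create(
        model="default",
        prompt="Hello",
        max_tokens=4,
        echo=True,   # return the prompt in front of the completion
        logprobs=1,  # with the fix, prompt tokens get logprobs too
    )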
sglang/srt/openai_api/protocol.py
CHANGED
@@ -36,7 +36,7 @@ class ModelList(BaseModel):
     """Model list consists of model cards."""
 
     object: str = "list"
-    data: List[ModelCard] =
+    data: List[ModelCard] = Field(default_factory=list)
 
 
 class ErrorResponse(BaseModel):
@@ -143,7 +143,7 @@ class BatchResponse(BaseModel):
     expired_at: Optional[int] = None
     cancelling_at: Optional[int] = None
     cancelled_at: Optional[int] = None
-    request_counts: dict =
+    request_counts: Optional[dict] = None
     metadata: Optional[dict] = None
 
 
@@ -153,30 +153,31 @@ class CompletionRequest(BaseModel):
     model: str
     prompt: Union[List[int], List[List[int]], str, List[str]]
     best_of: Optional[int] = None
-    echo:
-    frequency_penalty:
+    echo: bool = False
+    frequency_penalty: float = 0.0
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: Optional[int] = None
-    max_tokens:
+    max_tokens: int = 16
     n: int = 1
-    presence_penalty:
+    presence_penalty: float = 0.0
     seed: Optional[int] = None
-    stop: Optional[Union[str, List[str]]] =
-    stream:
+    stop: Optional[Union[str, List[str]]] = None
+    stream: bool = False
     stream_options: Optional[StreamOptions] = None
     suffix: Optional[str] = None
-    temperature:
-    top_p:
+    temperature: float = 1.0
+    top_p: float = 1.0
     user: Optional[str] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
-    regex: Optional[str] = None
     json_schema: Optional[str] = None
-
+    regex: Optional[str] = None
     min_tokens: int = 0
-    repetition_penalty:
-    stop_token_ids: Optional[List[int]] =
-    no_stop_trim:
+    repetition_penalty: float = 1.0
+    stop_token_ids: Optional[List[int]] = None
+    no_stop_trim: bool = False
+    ignore_eos: bool = False
+    skip_special_tokens: bool = True
 
 
 class CompletionResponseChoice(BaseModel):
@@ -259,28 +260,30 @@ class ChatCompletionRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: List[ChatCompletionMessageParam]
     model: str
-    frequency_penalty:
+    frequency_penalty: float = 0.0
     logit_bias: Optional[Dict[str, float]] = None
-    logprobs:
+    logprobs: bool = False
     top_logprobs: Optional[int] = None
     max_tokens: Optional[int] = None
-    n:
-    presence_penalty:
+    n: int = 1
+    presence_penalty: float = 0.0
     response_format: Optional[ResponseFormat] = None
     seed: Optional[int] = None
-    stop: Optional[Union[str, List[str]]] =
-    stream:
+    stop: Optional[Union[str, List[str]]] = None
+    stream: bool = False
     stream_options: Optional[StreamOptions] = None
-    temperature:
-    top_p:
+    temperature: float = 0.7
+    top_p: float = 1.0
     user: Optional[str] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
-    min_tokens:
-    repetition_penalty:
-    stop_token_ids: Optional[List[int]] =
+    min_tokens: int = 0
+    repetition_penalty: float = 1.0
+    stop_token_ids: Optional[List[int]] = None
+    no_stop_trim: bool = False
     ignore_eos: bool = False
+    skip_special_tokens: bool = True
 
 
 class ChatMessage(BaseModel):
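After this change the request models carry explicit defaults for the core OpenAI fields, and both CompletionRequest and ChatCompletionRequest expose no_stop_trim, ignore_eos, and skip_special_tokens. A quick sanity check of the defaults as written in this diff (a sketch; assumes sglang at this version is importable):

    from sglang.srt.openai_api.protocol import CompletionRequest

    req = CompletionRequest(model="default", prompt="Hi")
    assert req.max_tokens == 16 and req.temperature == 1.0 and req.top_p == 1.0
    assert req.ignore_eos is False and req.skip_special_tokens is True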
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, List, Optional
 import torch
 
 import sglang.srt.sampling.penaltylib as penaltylib
-from sglang.srt.constrained.grammar import Grammar
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -31,7 +30,7 @@ class SamplingBatchInfo:
     logit_bias: torch.Tensor = None
     vocab_mask: Optional[torch.Tensor] = None
 
-    grammars: Optional[List
+    grammars: Optional[List] = None
 
     # Penalizer
     penalizer_orchestrator: Optional[penaltylib.BatchedPenalizerOrchestrator] = None
@@ -146,7 +145,7 @@ class SamplingBatchInfo:
         )
         for i, grammar in enumerate(self.grammars):
             if grammar is not None:
-                grammar.fill_vocab_mask(self.vocab_mask[i]
+                grammar.fill_vocab_mask(self.vocab_mask[i])
 
     def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
         if self.penalizer_orchestrator:
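fill_vocab_mask marks the tokens a grammar currently forbids; at sampling time the mask pushes those logits to -inf so constrained decoding can never select them. A self-contained sketch of that masking step (illustrative, not the SGLang kernel path):

    import torch

    vocab_size = 8
    logits = torch.randn(2, vocab_size)
    vocab_mask = torch.zeros(2, vocab_size, dtype=torch.bool)
    vocab_mask[0, 3:] = True  # row 0: grammar allows only tokens 0-2

    logits = logits.masked_fill(vocab_mask, float("-inf"))
    probs = torch.softmax(logits, dim=-1)
    assert probs[0, 3:].sum() == 0  # forbidden tokens get zero probability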
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -34,13 +34,13 @@ class SamplingParams:
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repetition_penalty: float = 1.0,
-        ignore_eos: bool = False,
-        skip_special_tokens: bool = True,
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,
         json_schema: Optional[str] = None,
         no_stop_trim: bool = False,
+        ignore_eos: bool = False,
+        skip_special_tokens: bool = True,
     ) -> None:
         self.temperature = temperature
         self.top_p = top_p
@@ -133,17 +133,3 @@ class SamplingParams:
         else:
             stop_str_max_len = max(stop_str_max_len, len(stop_str))
         self.stop_str_max_len = stop_str_max_len
-
-    def to_srt_kwargs(self):
-        return {
-            "max_new_tokens": self.max_new_tokens,
-            "stop": self.stop_strs,
-            "stop_token_ids": list(self.stop_token_ids),
-            "temperature": self.temperature,
-            "top_p": self.top_p,
-            "top_k": self.top_k,
-            "frequency_penalty": self.frequency_penalty,
-            "presence_penalty": self.presence_penalty,
-            "ignore_eos": self.ignore_eos,
-            "regex": self.regex,
-        }