sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +2 -1
- sglang/lang/chat_template.py +17 -0
- sglang/launch_server_llavavid.py +1 -1
- sglang/srt/configs/__init__.py +3 -0
- sglang/srt/configs/model_config.py +27 -2
- sglang/srt/configs/qwen2vl.py +133 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/conversation.py +27 -0
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/__init__.py +16 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
- sglang/srt/layers/attention/flashinfer_backend.py +174 -54
- sglang/srt/layers/attention/triton_backend.py +22 -6
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
- sglang/srt/layers/linear.py +89 -63
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/rotary_embedding.py +112 -0
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/lora/lora.py +3 -1
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +4 -0
- sglang/srt/managers/image_processor.py +186 -13
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/schedule_batch.py +238 -68
- sglang/srt/managers/scheduler.py +69 -50
- sglang/srt/managers/tokenizer_manager.py +24 -4
- sglang/srt/managers/tp_worker.py +26 -111
- sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
- sglang/srt/mem_cache/memory_pool.py +56 -10
- sglang/srt/mem_cache/radix_cache.py +4 -3
- sglang/srt/model_executor/cuda_graph_runner.py +87 -28
- sglang/srt/model_executor/forward_batch_info.py +83 -3
- sglang/srt/model_executor/model_runner.py +32 -11
- sglang/srt/models/chatglm.py +3 -3
- sglang/srt/models/deepseek_v2.py +2 -2
- sglang/srt/models/mllama.py +1004 -0
- sglang/srt/models/qwen2_vl.py +724 -0
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +13 -3
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +12 -0
- sglang/srt/server_args.py +10 -0
- sglang/srt/utils.py +22 -0
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +20 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +100 -3
- sglang/version.py +1 -1
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/forward_batch_info.py
CHANGED
@@ -25,6 +25,8 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
 - ScheduleBatch is managed by `scheduler.py::Scheduler`.
   It contains high-level scheduling data. Most of the data is on the CPU.
 - ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
+  It is a subset of `ScheduleBatch` that only contains data related to the model forward on GPU.
+  It will be transformed from CPU scheduler to GPU model runner.
 - ForwardBatch is managed by `model_runner.py::ModelRunner`.
   It contains low-level tensor data. Most of the data consists of GPU tensors.
 """
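For orientation, the hand-off this docstring describes looks roughly as follows. This is a minimal sketch, not code from the package: the `get_model_worker_batch()` helper on the scheduler side and the `model_runner.forward(...)` entry point are assumed names, while `ForwardBatch.init_new` is the classmethod extended later in this diff.

```python
# Minimal sketch of the CPU -> GPU hand-off described above (assumed helper names).
model_worker_batch = schedule_batch.get_model_worker_batch()             # CPU-side subset of ScheduleBatch
forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)  # GPU tensors + attention metadata
logits_output = model_runner.forward(forward_batch)                      # extend or decode, possibly via CUDA graphs
```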
@@ -33,9 +35,10 @@ from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import TYPE_CHECKING, List, Optional
 
-import numpy as np
 import torch
 
+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
+
 if TYPE_CHECKING:
     from sglang.srt.layers.attention import AttentionBackend
     from sglang.srt.managers.schedule_batch import ImageInputs, ModelWorkerBatch
@@ -84,6 +87,9 @@ class ForwardBatch:
     # The indices of output tokens in the token_to_kv_pool
     out_cache_loc: torch.Tensor
 
+    # The sum of all sequence lengths
+    seq_lens_sum: int
+
     # For logprob
     return_logprob: bool = False
     top_logprobs_nums: Optional[List[int]] = None
@@ -92,6 +98,7 @@ class ForwardBatch:
     positions: torch.Tensor = None
 
     # For extend
+    extend_num_tokens: Optional[int] = None
     extend_seq_lens: Optional[torch.Tensor] = None
     extend_prefix_lens: Optional[torch.Tensor] = None
     extend_start_loc: Optional[torch.Tensor] = None
@@ -101,6 +108,12 @@ class ForwardBatch:
     # For multimodal
     image_inputs: Optional[List[ImageInputs]] = None
 
+    # Encoder-decoder
+    encoder_cached: Optional[List[bool]] = None
+    encoder_lens: Optional[torch.Tensor] = None
+    encoder_lens_cpu: Optional[List[int]] = None
+    encoder_out_cache_loc: Optional[torch.Tensor] = None
+
     # For LoRA
     lora_paths: Optional[List[str]] = None
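These fields appear to support the encoder-decoder (cross-attention) path added in this release alongside `sglang/srt/models/mllama.py`. A rough, illustrative reading of the per-request bookkeeping (the values and the interpretation of `encoder_cached` are assumptions, not taken from the diff):

```python
import torch

# Illustrative per-request encoder bookkeeping for a cross-attention model (made-up values).
encoder_lens_cpu = [6404, 0]    # encoder (e.g. image) tokens per request; 0 for a text-only request
encoder_cached = [True, False]  # presumably: whether the encoder output is already in the KV pool
encoder_lens = torch.tensor(encoder_lens_cpu, dtype=torch.int64)  # dtype assumed
```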
|
@@ -112,14 +125,71 @@ class ForwardBatch:
     token_to_kv_pool: BaseTokenToKVPool = None
     attn_backend: AttentionBackend = None
 
+    # For Qwen2-VL
+    mrope_positions: torch.Tensor = None
+
+    def compute_mrope_positions(
+        self, model_runner: ModelRunner, batch: ModelWorkerBatch
+    ):
+        device = model_runner.device
+        hf_config = model_runner.model_config.hf_config
+        mrope_positions_list = [None] * self.seq_lens.shape[0]
+        if self.forward_mode.is_decode():
+            for i, _ in enumerate(mrope_positions_list):
+                mrope_positions_list[i] = MRotaryEmbedding.get_next_input_positions(
+                    batch.mrope_positions_delta[i][0],
+                    int(self.seq_lens[i]) - 1,
+                    int(self.seq_lens[i]),
+                )
+        elif self.forward_mode.is_extend():
+            extend_start_loc_cpu = self.extend_start_loc.cpu().numpy()
+            for i, image_inputs in enumerate(batch.image_inputs):
+                extend_start_loc, extend_seq_len, extend_prefix_len = (
+                    extend_start_loc_cpu[i],
+                    batch.extend_seq_lens[i],
+                    batch.extend_prefix_lens[i],
+                )
+                if image_inputs is None:
+                    # text only
+                    mrope_positions = [
+                        [
+                            pos
+                            for pos in range(
+                                extend_prefix_len, extend_prefix_len + extend_seq_len
+                            )
+                        ]
+                    ] * 3
+                    mrope_position_delta = 0
+                else:
+                    # TODO: current qwen2-vl do not support radix cache since mrope position calculation
+                    mrope_positions, mrope_position_delta = (
+                        MRotaryEmbedding.get_input_positions(
+                            input_tokens=self.input_ids[
+                                extend_start_loc : extend_start_loc + extend_seq_len
+                            ],
+                            image_grid_thw=image_inputs.image_grid_thws,
+                            vision_start_token_id=hf_config.vision_start_token_id,
+                            spatial_merge_size=hf_config.vision_config.spatial_merge_size,
+                            context_len=0,
+                        )
+                    )
+                mrope_positions_list[i] = mrope_positions
+                batch.mrope_positions_delta[i].append(mrope_position_delta)
+
+        self.mrope_positions = torch.concat(
+            [torch.tensor(pos, device=device) for pos in mrope_positions_list],
+            axis=1,
+        )
+        self.mrope_positions = self.mrope_positions.to(torch.int64)
+
     @classmethod
     def init_new(
         cls,
         batch: ModelWorkerBatch,
         model_runner: ModelRunner,
     ):
-        device = model_runner.device
 
+        device = model_runner.device
         ret = cls(
             forward_mode=batch.forward_mode,
             batch_size=len(batch.seq_lens),
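A small worked example of the text-only branch above: M-RoPE keeps three position streams (temporal, height, width), and for plain text all three are the same running index, which is what the `[...] * 3` construction builds. The per-request lists are then concatenated along dim 1, giving a `(3, total_tokens)` int64 `mrope_positions` tensor.

```python
# Worked example of the text-only branch (made-up lengths).
extend_prefix_len, extend_seq_len = 4, 3
mrope_positions = [
    [pos for pos in range(extend_prefix_len, extend_prefix_len + extend_seq_len)]
] * 3
print(mrope_positions)  # [[4, 5, 6], [4, 5, 6], [4, 5, 6]] -> one (3, seq_len) block per request
```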
@@ -127,6 +197,12 @@ class ForwardBatch:
             req_pool_indices=batch.req_pool_indices,
             seq_lens=batch.seq_lens,
             out_cache_loc=batch.out_cache_loc,
+            image_inputs=batch.image_inputs,
+            encoder_cached=batch.encoder_cached,
+            encoder_lens=batch.encoder_lens,
+            encoder_lens_cpu=batch.encoder_lens_cpu,
+            encoder_out_cache_loc=batch.encoder_out_cache_loc,
+            seq_lens_sum=batch.seq_lens_sum,
             return_logprob=batch.return_logprob,
             top_logprobs_nums=batch.top_logprobs_nums,
             lora_paths=batch.lora_paths,
@@ -144,10 +220,11 @@ class ForwardBatch:
                 ],
                 axis=0,
             )
-            ret.
+            ret.extend_num_tokens = batch.extend_num_tokens
             ret.extend_seq_lens = torch.tensor(
                 batch.extend_seq_lens, dtype=torch.int32
             ).to(device, non_blocking=True)
+
             ret.extend_prefix_lens = torch.tensor(
                 batch.extend_prefix_lens, dtype=torch.int32
             ).to(device, non_blocking=True)
@@ -156,6 +233,9 @@ class ForwardBatch:
             ret.extend_seq_lens_cpu = batch.extend_seq_lens
             ret.extend_logprob_start_lens_cpu = batch.extend_logprob_start_lens
 
+        if model_runner.model_is_mrope:
+            ret.compute_mrope_positions(model_runner, batch)
+
         # Init attention information
         ret.req_to_token_pool = model_runner.req_to_token_pool
         ret.token_to_kv_pool = model_runner.token_to_kv_pool
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -59,8 +59,11 @@ from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     enable_show_time_cost,
     get_available_gpu_memory,
+    is_attention_free_model,
+    is_embedding_model,
     is_generation_model,
     is_multimodal_model,
+    model_has_inner_state,
     monkey_patch_vllm_dummy_weight_loader,
     monkey_patch_vllm_p2p_access_check,
 )
@@ -117,11 +120,16 @@ class ModelRunner:
         )
 
         if self.is_multimodal_model:
-            logger.
+            logger.warning(
                 "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
             )
             server_args.chunked_prefill_size = None
             server_args.mem_fraction_static *= 0.95
+            # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
+            if self.model_config.hf_config.architectures == [
+                "Qwen2VLForConditionalGeneration"
+            ]:
+                server_args.disable_radix_cache = True
 
         # Global vars
         if server_args.show_time_cost:
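The net effect for a Qwen2-VL checkpoint is the same as a user setting these options by hand (e.g. passing `--disable-radix-cache` at launch); roughly:

```python
# Rough net effect of the automatic adjustment above for Qwen2-VL (mirrors the added lines).
server_args.chunked_prefill_size = None   # chunked prefill is turned off for multimodal models
server_args.mem_fraction_static *= 0.95   # leave extra memory headroom
server_args.disable_radix_cache = True    # mrope position handling is not radix-cache compatible yet
```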
@@ -262,7 +270,6 @@ class ModelRunner:
             if hasattr(self.model, "get_attention_sliding_window_size")
             else None
         )
-        self.has_cross_attention = getattr(self.model, "has_cross_attention", False)
         self.is_generation = is_generation_model(
             self.model_config.hf_config.architectures, self.server_args.is_embedding
         )
@@ -316,11 +323,13 @@ class ModelRunner:
 
         def get_weight_iter(config):
             iter = loader._get_weights_iterator(
-
-
-
-
-
+                DefaultModelLoader.Source(
+                    config.model,
+                    revision=config.revision,
+                    fall_back_to_pt=getattr(
+                        self.model, "fall_back_to_pt_during_load", True
+                    ),
+                )
             )
             return iter
 
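The weight iterator is now fed a `DefaultModelLoader.Source` descriptor rather than loose arguments. A hedged sketch of constructing one, using only the fields visible in the added lines (the import path and the example model id are assumptions about the vLLM version this release targets):

```python
from vllm.model_executor.model_loader.loader import DefaultModelLoader  # import path assumed

source = DefaultModelLoader.Source(
    "Qwen/Qwen2-VL-7B-Instruct",  # example model id, not taken from the diff
    revision=None,
    fall_back_to_pt=True,
)
```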
@@ -444,6 +453,7 @@ class ModelRunner:
             size=max_num_reqs + 1,
             max_context_len=self.model_config.context_len + 4,
             device=self.device,
+            use_records=False,
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
@@ -499,7 +509,7 @@ class ModelRunner:
                 "Window attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
             )
-            assert not self.
+            assert not self.model_config.is_encoder_decoder, (
                 "Cross attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
             )
@@ -547,9 +557,7 @@ class ModelRunner:
         self.cuda_graph_runner = CudaGraphRunner(self)
 
     def forward_decode(self, forward_batch: ForwardBatch):
-        if self.cuda_graph_runner and self.cuda_graph_runner.can_run(
-            forward_batch.batch_size
-        ):
+        if self.cuda_graph_runner and self.cuda_graph_runner.can_run(forward_batch):
             return self.cuda_graph_runner.replay(forward_batch)
 
         forward_batch.positions = (forward_batch.seq_lens - 1).to(torch.int64)
@@ -617,6 +625,15 @@ class ModelRunner:
 
         return logits
 
+    @property
+    def model_is_mrope(self) -> bool:
+        """Detect if the model has "mrope" rope_scaling type.
+        mrope requires keep "rope_deltas" between prompt and decoding phases."""
+        rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {})
+        if rope_scaling is None:
+            return False
+        return rope_scaling.get("type", None) == "mrope"
+
 
 @lru_cache()
 def import_model_classes():
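The new property keys off the `rope_scaling` block of the HF config. For a Qwen2-VL-style config this looks roughly like the following (the `mrope_section` values are illustrative, not taken from the diff):

```python
# Illustrative Qwen2-VL-style rope_scaling entry.
rope_scaling = {"type": "mrope", "mrope_section": [16, 24, 24]}

# Mirrors ModelRunner.model_is_mrope above.
is_mrope = rope_scaling is not None and rope_scaling.get("type", None) == "mrope"
print(is_mrope)  # True
```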
@@ -662,3 +679,7 @@ def load_model_cls_srt(model_arch: str) -> Optional[Type[nn.Module]]:
 
 # Monkey patch model loader
 setattr(ModelRegistry, "_try_load_model_cls", load_model_cls_srt)
+setattr(ModelRegistry, "is_multimodal_model", is_multimodal_model)
+setattr(ModelRegistry, "is_attention_free_model", is_attention_free_model)
+setattr(ModelRegistry, "model_has_inner_state", model_has_inner_state)
+setattr(ModelRegistry, "is_embedding_model", is_embedding_model)
sglang/srt/models/chatglm.py
CHANGED
@@ -303,7 +303,7 @@ class GLMTransformer(nn.Module):
         return hidden_states
 
 
-class
+class ChatGLMM(nn.Module):
     def __init__(
         self,
         config,
@@ -366,7 +366,7 @@ class ChatGLMForCausalLM(nn.Module):
         self.config: ChatGLMConfig = config
         self.quant_config = quant_config
         self.max_position_embeddings = getattr(config, "max_sequence_length", 8192)
-        self.transformer =
+        self.transformer = ChatGLMM(config, cache_config, quant_config)
         self.lm_head = self.transformer.output_layer
         self.logits_processor = LogitsProcessor(config)
 
@@ -401,4 +401,4 @@ class ChatGLMModel(ChatGLMForCausalLM):
     pass
 
 
-EntryClass = [
+EntryClass = [ChatGLMModel]
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -250,7 +250,7 @@ class DeepseekV2Attention(nn.Module):
             bias=False,
             quant_config=quant_config,
         )
-        rope_scaling["
+        rope_scaling["rope_type"] = "deepseek_yarn"
         self.rotary_emb = get_rope(
             qk_rope_head_dim,
             rotary_dim=qk_rope_head_dim,
@@ -398,7 +398,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             bias=False,
             quant_config=quant_config,
         )
-        rope_scaling["
+        rope_scaling["rope_type"] = "deepseek_yarn"
         self.rotary_emb = get_rope(
             qk_rope_head_dim,
             rotary_dim=qk_rope_head_dim,
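Both hunks now write the DeepSeek-YaRN override under the `"rope_type"` key, which is what newer rotary-embedding helpers (e.g. recent vLLM `get_rope`) look up; older `rope_scaling` dicts used `"type"`. A compatibility shim for configs that still carry the old key could look like this (the shim itself is not part of this diff):

```python
# Not part of this diff: shim a legacy rope_scaling dict onto the new key name.
rope_scaling = {"type": "yarn", "factor": 40}  # example legacy-style entry
if "rope_type" not in rope_scaling and "type" in rope_scaling:
    rope_scaling["rope_type"] = rope_scaling.pop("type")

# The diff then forces the DeepSeek-specific variant before building the rotary embedding.
rope_scaling["rope_type"] = "deepseek_yarn"
```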