sglang-0.4.6.post4-py3-none-any.whl → sglang-0.4.6.post5-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +6 -6
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +23 -15
- sglang/bench_serving.py +133 -57
- sglang/compile_deep_gemm.py +4 -4
- sglang/srt/configs/model_config.py +39 -28
- sglang/srt/conversation.py +1 -1
- sglang/srt/disaggregation/decode.py +122 -133
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +11 -2
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +9 -19
- sglang/srt/disaggregation/prefill.py +126 -44
- sglang/srt/disaggregation/utils.py +116 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +28 -8
- sglang/srt/entrypoints/http_server.py +6 -4
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +63 -17
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +0 -10
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +104 -50
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +66 -9
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +7 -2
- sglang/srt/layers/quantization/deep_gemm.py +5 -3
- sglang/srt/layers/quantization/fp8.py +90 -0
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +16 -3
- sglang/srt/managers/mm_utils.py +293 -139
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +49 -21
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +92 -50
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +99 -24
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +2 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +20 -9
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +4 -0
- sglang/srt/model_executor/model_runner.py +144 -54
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_v2.py +297 -343
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/openai_api/adapter.py +28 -16
- sglang/srt/openai_api/protocol.py +6 -0
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/server_args.py +134 -24
- sglang/srt/speculative/eagle_utils.py +131 -0
- sglang/srt/speculative/eagle_worker.py +47 -2
- sglang/srt/utils.py +68 -12
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_utils.py +2 -36
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +20 -11
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +128 -102
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/managers/multimodal_processors/qwen_vl.py
+++ b/sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -1,6 +1,7 @@
 import asyncio
 import math
-from typing import List, Union
+import re
+from typing import Dict, List, Union
 
 import torch
 from PIL import Image
@@ -23,7 +24,12 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        # The single, pre-expanded image token.
         self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
+        # The regex that matches expanded image tokens.
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
+        )
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
         self.image_token_id = hf_config.image_token_id
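The IMAGE_TOKEN_REGEX added above lets the processor recognize an image placeholder that has already been expanded into multiple <|image_pad|> tokens. A minimal sketch of what the pattern matches; the pattern is copied from the hunk, the rest is illustrative:

import re

# One vision_start/vision_end pair wrapping one or more image_pad tokens:
# an already-expanded placeholder matches as a single unit.
IMAGE_TOKEN_REGEX = re.compile(
    r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
)

expanded = "<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|vision_end|>"
single = "<|vision_start|><|image_pad|><|vision_end|>"
assert IMAGE_TOKEN_REGEX.fullmatch(expanded) is not None
assert IMAGE_TOKEN_REGEX.fullmatch(single) is not None  # covers IMAGE_TOKEN too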
@@ -38,7 +44,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
@@ -48,11 +54,13 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         if isinstance(image_data, str):
             image_data = [image_data]
 
-        image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN,
+                image_token_regex=self.IMAGE_TOKEN_REGEX,
+            ),
             max_req_input_len=max_req_input_len,
         )
 
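load_mm_data now receives both the literal token and the regex, so prompts can be matched whether or not their image placeholders were pre-expanded. A hypothetical illustration of the splitting this enables (MultimodalSpecialTokens and load_mm_data internals are not part of this diff):

import re

IMAGE_TOKEN_REGEX = re.compile(
    r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
)

def split_prompt(prompt: str):
    # A capture group makes re.split keep the image slots in the output.
    return [seg for seg in re.split(f"({IMAGE_TOKEN_REGEX.pattern})", prompt) if seg]

print(split_prompt("a <|vision_start|><|image_pad|><|image_pad|><|vision_end|> b"))
# ['a ', '<|vision_start|><|image_pad|><|image_pad|><|vision_end|>', ' b']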
@@ -117,26 +125,60 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         async def resize_image_async(image):
             return resize_image(image)
 
-        if base_output.images:
+        images_are_preprocessed = self.mm_inputs_are_preprocessed(base_output.images)
+        if base_output.images and not images_are_preprocessed:
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
         ret = self.process_mm_data(
             input_text=base_output.input_text,
-            images=base_output.images,
+            images=None if images_are_preprocessed else base_output.images,
         )
-
+        input_ids = ret["input_ids"].flatten().tolist()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=ret["input_ids"].flatten(), mm_token_id=self.image_token_id
+        )
+        image_grid_thw = None
+        video_grid_thw = None  # TODO
         items = []
 
-
-
+        if base_output.images:
+            if images_are_preprocessed:
+                image_grid_thw = torch.concat(
+                    [
+                        torch.as_tensor(item.image_grid_thws)
+                        for item in base_output.images
+                    ]
+                )
+                all_pixel_values = [
+                    item.pixel_values
+                    for item in base_output.images
+                    if item.pixel_values is not None
+                ]
+                all_precomputed_features = [
+                    item.precomputed_features
+                    for item in base_output.images
+                    if item.precomputed_features is not None
+                ]
+                pixel_values = (
+                    torch.concat(all_pixel_values) if all_pixel_values else None
+                )
+                precomputed_features = (
+                    torch.concat(all_precomputed_features)
+                    if all_precomputed_features
+                    else None
+                )
+            else:
+                image_grid_thw = ret["image_grid_thw"]
+                pixel_values = ret["pixel_values"]
+                precomputed_features = None
         items += [
             MultimodalDataItem(
-                pixel_values=ret["pixel_values"],
-                image_grid_thws=ret["image_grid_thw"],
-
-
-
+                pixel_values=pixel_values,
+                image_grid_thws=image_grid_thw,
+                video_grid_thws=video_grid_thw,
+                precomputed_features=precomputed_features,
+                image_offsets=image_offsets,
                 modality=Modality.IMAGE,
             )
         ]
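The rewritten body computes image_offsets with self.get_mm_items_offset, whose implementation is not included in this diff. A plausible sketch of the behavior implied by the call site, assuming it returns (start, end) index pairs for each contiguous run of the image token id:

import torch

def get_mm_items_offset_sketch(input_ids: torch.Tensor, mm_token_id: int):
    # Collect (start, end) pairs for each contiguous run of mm_token_id.
    offsets = []
    for pos in (input_ids == mm_token_id).nonzero(as_tuple=True)[0].tolist():
        if offsets and offsets[-1][1] == pos - 1:
            offsets[-1] = (offsets[-1][0], pos)  # extend the current run
        else:
            offsets.append((pos, pos))  # start a new run
    return offsets

ids = torch.tensor([5, 9, 9, 9, 7, 9, 9])
assert get_mm_items_offset_sketch(ids, 9) == [(1, 3), (5, 6)]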
@@ -151,8 +193,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=torch.tensor(input_ids).unsqueeze(0),
-            image_grid_thw=ret["image_grid_thw"],
-            video_grid_thw=
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
             second_per_grid_ts=ret.get("second_per_grid_ts", None),
         )
         mrope_positions = mrope_positions.squeeze(1)
--- a/sglang/srt/managers/schedule_batch.py
+++ b/sglang/srt/managers/schedule_batch.py
@@ -48,7 +48,10 @@ from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.disaggregation.base import BaseKVSender
-from sglang.srt.disaggregation.decode import ScheduleBatchDisaggregationDecodeMixin
+from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
+    ScheduleBatchDisaggregationDecodeMixin,
+)
+from sglang.srt.layers.multimodal import gpu_tensor_hash
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
@@ -77,16 +80,19 @@ global_server_args_dict = {
     "enable_dp_attention": ServerArgs.enable_dp_attention,
     "enable_dp_lm_head": ServerArgs.enable_dp_lm_head,
     "enable_ep_moe": ServerArgs.enable_ep_moe,
+    "deepep_config": ServerArgs.deepep_config,
     "enable_nan_detection": ServerArgs.enable_nan_detection,
     "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
     "max_micro_batch_size": ServerArgs.max_micro_batch_size,
     "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
+    "ep_dispatch_algorithm": ServerArgs.ep_dispatch_algorithm,
     "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
     "sampling_backend": ServerArgs.sampling_backend,
     "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
     "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
     "torchao_config": ServerArgs.torchao_config,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
+    "ep_num_redundant_experts": ServerArgs.ep_num_redundant_experts,
 }
 
 logger = logging.getLogger(__name__)
@@ -177,10 +183,10 @@ class MultimodalDataItem:
     image_offsets: Optional[list] = None
 
     # the real data, pixel_values or audio_features
-    # data: Union[List[torch.Tensor], List[np.array]]
-    pixel_values: Union[torch.Tensor, np.array] = None
-    image_grid_thws: Union[torch.Tensor, np.array] = None
-    video_grid_thws: Union[torch.Tensor, np.array] = None
+    # data: Union[List[torch.Tensor], List[np.ndarray]]
+    pixel_values: Union[torch.Tensor, np.ndarray] = None
+    image_grid_thws: Union[torch.Tensor, np.ndarray] = None
+    video_grid_thws: Union[torch.Tensor, np.ndarray] = None
 
     image_emb_mask: Optional[torch.Tensor] = None
     image_spatial_crop: Optional[torch.Tensor] = None
@@ -189,8 +195,11 @@ class MultimodalDataItem:
     # [num_images, (n, w, h)]
     tgt_size: Tuple[int, int] = None
 
-    audio_features: Union[torch.Tensor, np.array] = None
+    audio_features: Union[torch.Tensor, np.ndarray] = None
     audio_feature_lens: Optional[List[torch.Tensor]] = None
+    audio_offsets: Optional[List[Tuple[int, int]]] = None
+
+    precomputed_features: Optional[Union[torch.Tensor, np.ndarray]] = None
 
     @staticmethod
     def is_empty_list(l):
@@ -219,7 +228,8 @@ class MultimodalDataItem:
             for x in tensor_list
         ]
         tensor = torch.concat(tensor_list)
-
+        if tensor.is_cuda:
+            return gpu_tensor_hash(tensor)
         tensor = tensor.detach().contiguous()
 
         if tensor.dtype == torch.bfloat16:
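The new branch hashes CUDA tensors on-device via gpu_tensor_hash, imported from the new sglang/srt/layers/multimodal.py (its real implementation is not shown in this diff). A hypothetical stand-in that captures the idea — reduce the raw bytes on the GPU and transfer only a scalar, instead of copying the whole tensor to the host for byte-level hashing:

import torch

def gpu_tensor_hash_sketch(t: torch.Tensor) -> int:
    # Reinterpret the payload as bytes, weight each byte by its position so
    # permutations hash differently, then reduce on the tensor's device.
    # Intermediate int64 overflow simply wraps, which is fine for a hash.
    x = t.contiguous().view(torch.uint8).flatten().to(torch.int64)
    weights = torch.arange(1, x.numel() + 1, device=x.device, dtype=torch.int64)
    return int((x * weights).sum().item() % (1 << 61))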
@@ -249,7 +259,9 @@ class MultimodalDataItem:
                 return tensor_hash([f])
             return data_hash(f)
 
-        if self.is_audio():
+        if self.precomputed_features is not None:
+            self.hash = hash_feature(self.precomputed_features)
+        elif self.is_audio():
             self.hash = hash_feature(self.audio_features)
         else:
             self.hash = hash_feature(self.pixel_values)
@@ -258,19 +270,24 @@ class MultimodalDataItem:
         self.pad_value = self.hash % (1 << 30)
 
     def is_audio(self):
-        return (
-            self.modality == Modality.AUDIO
-        ) and not MultimodalDataItem.is_empty_list(self.audio_features)
+        return (self.modality == Modality.AUDIO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.audio_features)
+        )
 
     def is_image(self):
         return (
             self.modality == Modality.IMAGE or self.modality == Modality.MULTI_IMAGES
-        ) and not MultimodalDataItem.is_empty_list(self.pixel_values)
+        ) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_video(self):
-        return (
-            self.modality == Modality.VIDEO
-        ) and not MultimodalDataItem.is_empty_list(self.pixel_values)
+        return (self.modality == Modality.VIDEO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_valid(self) -> bool:
         return self.is_image() or self.is_video() or self.is_audio()
@@ -279,6 +296,16 @@ class MultimodalDataItem:
         ...
         # TODO
 
+    @staticmethod
+    def from_dict(obj: dict):
+        kwargs = dict(obj)
+        modality = kwargs.pop("modality")
+        if isinstance(modality, str):
+            modality = Modality[modality]
+        ret = MultimodalDataItem(modality=modality, **kwargs)
+        ret.validate()
+        return ret
+
 
 @dataclasses.dataclass
 class MultimodalInputs:
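A minimal usage sketch of the new constructor (names are from the hunk above; the payload is made up). Because modality may arrive as a plain string, an item serialized as a dict — for example, one shipped between processes — can be rehydrated directly:

import torch

item = MultimodalDataItem.from_dict(
    {
        "modality": "IMAGE",  # Modality["IMAGE"] resolves the string form
        "precomputed_features": torch.randn(1, 16, 32),
    }
)
# With precomputed_features set, is_image() passes even without pixel_values.
assert item.is_image()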
@@ -304,8 +331,9 @@ class MultimodalInputs:
     video_token_id: Optional[int] = None
 
     # audio
-    audio_start_id: Optional[int] = None
-    audio_end_id: Optional[int] = None
+    audio_token_id: Optional[int] = None
+    audio_start_id: Optional[int] = None
+    audio_end_id: Optional[int] = None
 
     @staticmethod
     def from_dict(obj: dict):
@@ -329,6 +357,7 @@ class MultimodalInputs:
             "slice_end_id",
             "audio_start_id",
             "audio_end_id",
+            "audio_token_id",
         ]
         for arg in optional_args:
             if arg in obj:
@@ -578,9 +607,6 @@ class Req:
         self.tmp_end_idx: int = -1
         self.metadata_buffer_index: int = -1
 
-        # The first output_id transferred from prefill instance.
-        self.transferred_output_id: Optional[int] = None
-
     @property
     def seqlen(self):
         return len(self.origin_input_ids) + len(self.output_ids)
@@ -1069,7 +1095,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         else:
             self.encoder_out_cache_loc = torch.cat(encoder_out_cache_loc)
 
-        assert len(self.out_cache_loc) == self.extend_num_tokens
+        assert (
+            len(self.out_cache_loc) == self.extend_num_tokens
+        ), f"Expected {len(self.out_cache_loc)}, got {self.extend_num_tokens}"
 
     def prepare_for_extend(self):
         self.forward_mode = ForwardMode.EXTEND
--- a/sglang/srt/managers/schedule_policy.py
+++ b/sglang/srt/managers/schedule_policy.py
@@ -22,11 +22,7 @@ from typing import Dict, List, Optional, Set, Union
 
 import torch
 
-from sglang.srt.managers.schedule_batch import (
-    Req,
-    ScheduleBatch,
-    global_server_args_dict,
-)
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
@@ -468,6 +464,9 @@ class PrefillAdder:
             return AddReqResult.OTHER
 
         with self._lock_node(req.last_node):
+            if total_tokens > self.rem_total_tokens:
+                return AddReqResult.NO_TOKEN
+
             if (
                 enable_hierarchical_cache
                 and req.last_node_global is not None