sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +41 -27
- sglang/bench_one_batch.py +60 -4
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +83 -71
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +46 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +452 -0
- sglang/srt/entrypoints/http_server.py +603 -0
- sglang/srt/function_call_parser.py +494 -0
- sglang/srt/layers/activation.py +8 -8
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +71 -0
- sglang/srt/layers/layernorm.py +5 -5
- sglang/srt/layers/linear.py +65 -14
- sglang/srt/layers/logits_processor.py +49 -64
- sglang/srt/layers/moe/ep_moe/layer.py +24 -16
- sglang/srt/layers/moe/fused_moe_native.py +84 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
- sglang/srt/layers/parameter.py +18 -8
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/fp8.py +10 -4
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1184 -31
- sglang/srt/layers/sampler.py +64 -6
- sglang/srt/layers/torchao_utils.py +12 -6
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +24 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +57 -3
- sglang/srt/managers/schedule_batch.py +78 -45
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +326 -201
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +210 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +26 -30
- sglang/srt/model_executor/forward_batch_info.py +5 -7
- sglang/srt/model_executor/model_runner.py +44 -19
- sglang/srt/model_loader/loader.py +83 -6
- sglang/srt/model_loader/weight_utils.py +145 -6
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +17 -5
- sglang/srt/models/dbrx.py +13 -5
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +11 -11
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +15 -25
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +4 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +9 -9
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +20 -7
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/adapter.py +139 -37
- sglang/srt/openai_api/protocol.py +7 -4
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- sglang/srt/sampling/sampling_batch_info.py +143 -18
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +77 -15
- sglang/srt/speculative/eagle_utils.py +37 -15
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/utils.py +164 -129
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +2 -1
- sglang/test/test_utils.py +83 -22
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,7 @@ import torch
|
|
24
24
|
|
25
25
|
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
26
26
|
from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
|
27
|
+
from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool
|
27
28
|
from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
|
28
29
|
|
29
30
|
# Clip the estimation of max_new_tokens for the request whose max_new_tokens is very large.
|
@@ -250,23 +251,24 @@ class PrefillAdder:
|
|
250
251
|
def __init__(
|
251
252
|
self,
|
252
253
|
tree_cache: BasePrefixCache,
|
254
|
+
token_to_kv_pool: BaseTokenToKVPool,
|
253
255
|
running_batch: ScheduleBatch,
|
254
256
|
new_token_ratio: float,
|
255
|
-
rem_total_tokens: int,
|
256
257
|
rem_input_tokens: int,
|
257
258
|
rem_chunk_tokens: Optional[int],
|
258
259
|
mixed_with_decode_tokens: int = 0,
|
259
260
|
):
|
260
261
|
self.tree_cache = tree_cache
|
262
|
+
self.token_to_kv_pool = token_to_kv_pool
|
261
263
|
self.running_batch = running_batch
|
262
264
|
self.new_token_ratio = new_token_ratio
|
263
|
-
self.rem_total_tokens = rem_total_tokens - mixed_with_decode_tokens
|
264
265
|
self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
|
265
266
|
self.rem_chunk_tokens = rem_chunk_tokens
|
266
267
|
if self.rem_chunk_tokens is not None:
|
267
268
|
self.rem_chunk_tokens -= mixed_with_decode_tokens
|
268
269
|
|
269
|
-
self.cur_rem_tokens = rem_total_tokens - mixed_with_decode_tokens
|
270
|
+
self.rem_total_token_offset = mixed_with_decode_tokens
|
271
|
+
self.cur_rem_token_offset = mixed_with_decode_tokens
|
270
272
|
|
271
273
|
self.req_states = None
|
272
274
|
self.can_run_list = []
|
@@ -275,8 +277,7 @@ class PrefillAdder:
|
|
275
277
|
self.log_input_tokens = 0
|
276
278
|
|
277
279
|
if running_batch is not None:
|
278
|
-
|
279
|
-
self.rem_total_tokens -= sum(
|
280
|
+
self.rem_total_token_offset += sum(
|
280
281
|
[
|
281
282
|
min(
|
282
283
|
(r.sampling_params.max_new_tokens - len(r.output_ids)),
|
@@ -287,6 +288,22 @@ class PrefillAdder:
|
|
287
288
|
]
|
288
289
|
)
|
289
290
|
|
291
|
+
@property
|
292
|
+
def rem_total_tokens(self):
|
293
|
+
return (
|
294
|
+
self.token_to_kv_pool.available_size()
|
295
|
+
+ self.tree_cache.evictable_size()
|
296
|
+
- self.rem_total_token_offset
|
297
|
+
)
|
298
|
+
|
299
|
+
@property
|
300
|
+
def cur_rem_tokens(self):
|
301
|
+
return (
|
302
|
+
self.token_to_kv_pool.available_size()
|
303
|
+
+ self.tree_cache.evictable_size()
|
304
|
+
- self.cur_rem_token_offset
|
305
|
+
)
|
306
|
+
|
290
307
|
def budget_state(self):
|
291
308
|
if self.rem_total_tokens <= 0 or self.cur_rem_tokens <= 0:
|
292
309
|
return AddReqResult.NO_TOKEN
|
@@ -301,8 +318,8 @@ class PrefillAdder:
|
|
301
318
|
def _prefill_one_req(
|
302
319
|
self, prefix_len: int, extend_input_len: int, max_new_tokens: int
|
303
320
|
):
|
304
|
-
self.rem_total_tokens -= extend_input_len + max_new_tokens
|
305
|
-
self.cur_rem_tokens -= extend_input_len
|
321
|
+
self.rem_total_token_offset += extend_input_len + max_new_tokens
|
322
|
+
self.cur_rem_token_offset += extend_input_len
|
306
323
|
self.rem_input_tokens -= extend_input_len
|
307
324
|
if self.rem_chunk_tokens is not None:
|
308
325
|
self.rem_chunk_tokens -= extend_input_len
|
@@ -332,12 +349,10 @@ class PrefillAdder:
|
|
332
349
|
@contextmanager
|
333
350
|
def _lock_node(self, last_node: TreeNode):
|
334
351
|
try:
|
335
|
-
|
336
|
-
self.rem_total_tokens += delta
|
352
|
+
self.tree_cache.inc_lock_ref(last_node)
|
337
353
|
yield None
|
338
354
|
finally:
|
339
|
-
|
340
|
-
self.rem_total_tokens += delta
|
355
|
+
self.tree_cache.dec_lock_ref(last_node)
|
341
356
|
|
342
357
|
def add_one_req_ignore_eos(self, req: Req):
|
343
358
|
def add_req_state(r, insert_sort=False):
|
@@ -433,7 +448,6 @@ class PrefillAdder:
|
|
433
448
|
or input_tokens <= self.rem_chunk_tokens
|
434
449
|
or (
|
435
450
|
req.return_logprob
|
436
|
-
and req.normalized_prompt_logprob is None
|
437
451
|
and req.logprob_start_len != len(req.origin_input_ids) - 1
|
438
452
|
)
|
439
453
|
):
|