sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
```diff
--- a/sglang/srt/managers/scheduler_output_processor_mixin.py
+++ b/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -1,8 +1,11 @@
 from __future__ import annotations

+import logging
 import threading
+import time
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union

+from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.io_struct import BatchEmbeddingOut, BatchTokenIDOut
 from sglang.srt.managers.schedule_batch import BaseFinishReason, Req, ScheduleBatch
@@ -15,6 +18,10 @@ if TYPE_CHECKING:
         Scheduler,
     )

+logger = logging.getLogger(__name__)
+
+DEFAULT_FORCE_STREAM_INTERVAL = 50
+

 class SchedulerOutputProcessorMixin:
     """
@@ -36,20 +43,16 @@ class SchedulerOutputProcessorMixin:
                 next_token_ids,
                 extend_input_len_per_req,
                 extend_logprob_start_len_per_req,
-                bid,
             ) = (
                 result.logits_output,
                 result.next_token_ids,
                 result.extend_input_len_per_req,
                 result.extend_logprob_start_len_per_req,
-                result.bid,
             )

             if self.enable_overlap:
-                logits_output, next_token_ids = (
-                    self.tp_worker.resolve_last_batch_result(
-                        launch_done,
-                    )
+                logits_output, next_token_ids, _ = (
+                    self.tp_worker.resolve_last_batch_result(launch_done)
                 )
             else:
                 # Move next_token_ids and logprobs to cpu
```
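
Both result-unpacking paths now expect a 3-tuple from `tp_worker.resolve_last_batch_result(launch_done)`: the prefill path above discards the third element (`_`), while the decode path below binds it as `can_run_cuda_graph`. A minimal sketch of that contract under the overlap schedule; the `FakeTpWorker` class and its `publish` method are invented for illustration, and only the 3-tuple shape and the `launch_done` event come from the diff:

```python
import threading
from typing import Optional, Tuple

class FakeTpWorker:
    """Invented stand-in for the TP worker; shows only the result contract."""

    def __init__(self) -> None:
        self._result: Optional[Tuple[object, list, bool]] = None

    def publish(self, logits, token_ids, can_run_cuda_graph: bool,
                done: threading.Event) -> None:
        # The forward thread stores the delayed batch result, then signals.
        self._result = (logits, token_ids, can_run_cuda_graph)
        done.set()

    def resolve_last_batch_result(self, launch_done: Optional[threading.Event]):
        if launch_done is not None:
            launch_done.wait()  # block until the overlapped batch has a result
        return self._result

done = threading.Event()
worker = FakeTpWorker()
worker.publish(logits=None, token_ids=[42], can_run_cuda_graph=True, done=done)
logits_output, next_token_ids, _ = worker.resolve_last_batch_result(done)
```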
```diff
@@ -85,6 +88,7 @@

                     if req.finished():
                         self.tree_cache.cache_finished_req(req)
+                        req.time_stats.completion_time = time.time()
                     elif not batch.decoding_reqs or req not in batch.decoding_reqs:
                         # This updates radix so others can match
                         self.tree_cache.cache_unfinished_req(req)
```
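
Finished prefill requests (and, in a later hunk, finished decode requests) now record a completion timestamp on `req.time_stats`. A hypothetical miniature of such a record; only the `completion_time` field and the `time.time()` stamp are taken from the diff, the rest is illustrative:

```python
import time
from dataclasses import dataclass, field

@dataclass
class TimeStats:
    """Hypothetical per-request timing record (not sglang's actual schema)."""
    created_time: float = field(default_factory=time.time)
    completion_time: float = 0.0

    def e2e_latency_s(self) -> float:
        # Only meaningful after completion_time has been stamped.
        return self.completion_time - self.created_time

stats = TimeStats()
# ... the request runs to completion ...
stats.completion_time = time.time()
print(f"e2e latency: {stats.e2e_latency_s():.3f}s")
```

Together with the `req.log_time_stats()` call added in the last hunk (gated by `server_args.enable_request_time_stats_logging`), this gives per-request end-to-end timing.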
```diff
@@ -151,10 +155,7 @@
                     )
                     logprob_pt += num_input_logprobs

-
-            batch.next_batch_sampling_info.update_regex_vocab_mask()
-            self.current_stream.synchronize()
-            batch.next_batch_sampling_info.sampling_info_done.set()
+            self.set_next_batch_sampling_info_done(batch)

         else:  # embedding or reward model
             embeddings, bid = result.embeddings, result.bid
```
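
The three-line sequence removed here (and again in the decode hunk further down) is replaced by a single `self.set_next_batch_sampling_info_done(batch)` call. Judging only from the removed lines, the helper plausibly reads like the sketch below; the `if` guard and the free-function form are assumptions:

```python
def set_next_batch_sampling_info_done(scheduler, batch) -> None:
    """Sketch of the consolidated helper, reconstructed from the removed lines."""
    info = batch.next_batch_sampling_info
    if info:  # assumed guard; the removed call sites ran these unconditionally
        info.update_regex_vocab_mask()          # apply grammar vocab masks
        scheduler.current_stream.synchronize()  # make the GPU writes visible
        info.sampling_info_done.set()           # unblock the waiting sampler
```

Centralizing this keeps the prefill and decode call sites from drifting apart.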
```diff
@@ -187,16 +188,16 @@
         result: GenerationBatchResult,
         launch_done: Optional[threading.Event] = None,
     ):
-        logits_output, next_token_ids, bid = (
+        logits_output, next_token_ids, can_run_cuda_graph = (
             result.logits_output,
             result.next_token_ids,
-            result.bid,
+            result.can_run_cuda_graph,
         )
         self.num_generated_tokens += len(batch.reqs)

         if self.enable_overlap:
-            logits_output, next_token_ids = self.tp_worker.resolve_last_batch_result(
-                launch_done
+            logits_output, next_token_ids, can_run_cuda_graph = (
+                self.tp_worker.resolve_last_batch_result(launch_done)
             )
             next_token_logprobs = logits_output.next_token_logprobs
         elif batch.spec_algorithm.is_none():
@@ -235,6 +236,7 @@
             req.check_finished()
             if req.finished():
                 self.tree_cache.cache_finished_req(req)
+                req.time_stats.completion_time = time.time()

             if req.return_logprob and batch.spec_algorithm.is_none():
                 # speculative worker handles logprob in speculative decoding
@@ -264,13 +266,8 @@
                 req.grammar.accept_token(next_token_id)
                 req.grammar.finished = req.finished()

-
-        batch.next_batch_sampling_info.update_regex_vocab_mask()
-        self.current_stream.synchronize()
-        batch.next_batch_sampling_info.sampling_info_done.set()
-
+        self.set_next_batch_sampling_info_done(batch)
         self.stream_output(batch.reqs, batch.return_logprob)
-
         self.token_to_kv_pool_allocator.free_group_end()

         self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30)
@@ -278,7 +275,7 @@
             self.attn_tp_rank == 0
             and self.forward_ct_decode % self.server_args.decode_log_interval == 0
         ):
-            self.log_decode_stats(running_batch=batch)
+            self.log_decode_stats(can_run_cuda_graph, running_batch=batch)

     def add_input_logprob_return_values(
         self: Scheduler,
@@ -512,29 +509,47 @@
             if self.model_config.is_multimodal_gen and req.to_abort:
                 continue

-            if (
-                req.
-
-
-
-
-
-
-
-
+            if req.finished():
+                if req.finished_output:
+                    # With the overlap schedule, a request will try to output twice and hit this line twice
+                    # because of the one additional delayed token. This "continue" prevented the dummy output.
+                    continue
+                req.finished_output = True
+                should_output = True
+            else:
+                if req.stream:
+                    stream_interval = (
+                        req.sampling_params.stream_interval or self.stream_interval
+                    )
+                    should_output = len(req.output_ids) % stream_interval == 0
+                else:
+                    should_output = (
+                        len(req.output_ids) % DEFAULT_FORCE_STREAM_INTERVAL == 0
+                        and not self.model_config.is_multimodal_gen
+                    )
+
+            if should_output:
+                send_token_offset = req.send_token_offset
+                send_output_token_logprobs_offset = (
+                    req.send_output_token_logprobs_offset
                 )
-            ):
                 rids.append(req.rid)
                 finished_reasons.append(
                     req.finished_reason.to_json() if req.finished_reason else None
                 )
                 decoded_texts.append(req.decoded_text)
                 decode_ids, read_offset = req.init_incremental_detokenize()
-
+
+                if self.model_config.is_multimodal_gen:
+                    decode_ids_list.append(decode_ids)
+                else:
+                    decode_ids_list.append(decode_ids[req.send_decode_id_offset :])
+
+                req.send_decode_id_offset = len(decode_ids)
                 read_offsets.append(read_offset)
                 if self.skip_tokenizer_init:
-                    output_ids.append(req.output_ids)
+                    output_ids.append(req.output_ids[send_token_offset:])
+                    req.send_token_offset = len(req.output_ids)
                 skip_special_tokens.append(req.sampling_params.skip_special_tokens)
                 spaces_between_special_tokens.append(
                     req.sampling_params.spaces_between_special_tokens
```
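
The last hunk above rewrites the output gate: a request flushes when it finishes (exactly once, guarded by `req.finished_output`), at a per-request `stream_interval` when streaming (`req.sampling_params.stream_interval or self.stream_interval`), or every `DEFAULT_FORCE_STREAM_INTERVAL` (50) tokens otherwise so the detokenizer still works incrementally. The new `send_token_offset` and `send_decode_id_offset` fields then make each flush ship only the tokens produced since the previous flush. A self-contained toy of that offset bookkeeping; `ReqSketch` and `take_unsent` are invented names:

```python
class ReqSketch:
    """Toy request carrying only the fields needed for the offset trick."""

    def __init__(self) -> None:
        self.output_ids: list[int] = []
        self.send_token_offset = 0  # index of the first not-yet-sent token

    def take_unsent(self) -> list[int]:
        chunk = self.output_ids[self.send_token_offset:]
        self.send_token_offset = len(self.output_ids)  # everything is now sent
        return chunk

req = ReqSketch()
req.output_ids += [11, 12, 13]
assert req.take_unsent() == [11, 12, 13]  # first flush sends all tokens so far
req.output_ids += [14]
assert req.take_unsent() == [14]          # later flushes send only the delta
assert req.take_unsent() == []            # no new tokens, nothing re-sent
```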
```diff
@@ -548,36 +563,90 @@
                 spec_verify_ct.append(req.spec_verify_ct)

             if return_logprob:
-
-
-
-
-
-
-
-
-
-                    req.
-
-
-
-
-
-
-
-
-
+                if (
+                    req.return_logprob
+                    and not req.input_logprob_sent
+                    # Decode server does not send input logprobs
+                    and self.disaggregation_mode != DisaggregationMode.DECODE
+                ):
+                    input_token_logprobs_val.append(req.input_token_logprobs_val)
+                    input_token_logprobs_idx.append(req.input_token_logprobs_idx)
+                    input_top_logprobs_val.append(req.input_top_logprobs_val)
+                    input_top_logprobs_idx.append(req.input_top_logprobs_idx)
+                    input_token_ids_logprobs_val.append(
+                        req.input_token_ids_logprobs_val
+                    )
+                    input_token_ids_logprobs_idx.append(
+                        req.input_token_ids_logprobs_idx
+                    )
+                    req.input_logprob_sent = True
+                else:
+                    input_token_logprobs_val.append([])
+                    input_token_logprobs_idx.append([])
+                    input_top_logprobs_val.append([])
+                    input_top_logprobs_idx.append([])
+                    input_token_ids_logprobs_val.append([])
+                    input_token_ids_logprobs_idx.append([])
+
+                if req.return_logprob:
+                    output_token_logprobs_val.append(
+                        req.output_token_logprobs_val[
+                            send_output_token_logprobs_offset:
+                        ]
+                    )
+                    output_token_logprobs_idx.append(
+                        req.output_token_logprobs_idx[
+                            send_output_token_logprobs_offset:
+                        ]
+                    )
+                    output_top_logprobs_val.append(
+                        req.output_top_logprobs_val[
+                            send_output_token_logprobs_offset:
+                        ]
+                    )
+                    output_top_logprobs_idx.append(
+                        req.output_top_logprobs_idx[
+                            send_output_token_logprobs_offset:
+                        ]
+                    )
+                    output_token_ids_logprobs_val.append(
+                        req.output_token_ids_logprobs_val[
+                            send_output_token_logprobs_offset:
+                        ]
+                    )
+                    output_token_ids_logprobs_idx.append(
+                        req.output_token_ids_logprobs_idx[
+                            send_output_token_logprobs_offset:
+                        ]
+                    )
+                    req.send_output_token_logprobs_offset = len(
+                        req.output_token_logprobs_val
+                    )
+                else:
+                    output_token_logprobs_val.append([])
+                    output_token_logprobs_idx.append([])
+                    output_top_logprobs_val.append([])
+                    output_top_logprobs_idx.append([])
+                    output_token_ids_logprobs_val.append([])
+                    output_token_ids_logprobs_idx.append([])

                 if req.return_hidden_states:
                     if output_hidden_states is None:
                         output_hidden_states = []
                     output_hidden_states.append(req.hidden_states)

+            if (
+                req.finished()
+                and self.tp_rank == 0
+                and self.server_args.enable_request_time_stats_logging
+            ):
+                req.log_time_stats()
+
         # Send to detokenizer
         if rids:
             if self.model_config.is_multimodal_gen:
                 return
+
             self.send_to_detokenizer.send_pyobj(
                 BatchTokenIDOut(
                     rids,
```