sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl

This diff compares publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_offline_throughput.py +4 -2
  2. sglang/bench_one_batch.py +3 -13
  3. sglang/bench_one_batch_server.py +143 -15
  4. sglang/bench_serving.py +158 -8
  5. sglang/compile_deep_gemm.py +1 -1
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +119 -75
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +5 -2
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/internvl.py +696 -0
  13. sglang/srt/configs/janus_pro.py +3 -0
  14. sglang/srt/configs/model_config.py +18 -0
  15. sglang/srt/constrained/base_grammar_backend.py +55 -72
  16. sglang/srt/constrained/llguidance_backend.py +25 -21
  17. sglang/srt/constrained/outlines_backend.py +27 -26
  18. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  19. sglang/srt/constrained/xgrammar_backend.py +71 -53
  20. sglang/srt/conversation.py +78 -46
  21. sglang/srt/disaggregation/base/conn.py +1 -0
  22. sglang/srt/disaggregation/decode.py +11 -3
  23. sglang/srt/disaggregation/fake/conn.py +1 -1
  24. sglang/srt/disaggregation/mini_lb.py +74 -23
  25. sglang/srt/disaggregation/mooncake/conn.py +236 -138
  26. sglang/srt/disaggregation/nixl/conn.py +242 -71
  27. sglang/srt/disaggregation/prefill.py +7 -4
  28. sglang/srt/disaggregation/utils.py +51 -2
  29. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  30. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  31. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  32. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  33. sglang/srt/distributed/parallel_state.py +22 -1
  34. sglang/srt/entrypoints/engine.py +31 -4
  35. sglang/srt/entrypoints/http_server.py +45 -3
  36. sglang/srt/entrypoints/verl_engine.py +3 -2
  37. sglang/srt/function_call_parser.py +2 -2
  38. sglang/srt/hf_transformers_utils.py +20 -1
  39. sglang/srt/layers/attention/flashattention_backend.py +147 -51
  40. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  41. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  42. sglang/srt/layers/attention/merge_state.py +46 -0
  43. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  44. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  45. sglang/srt/layers/attention/utils.py +4 -2
  46. sglang/srt/layers/attention/vision.py +290 -163
  47. sglang/srt/layers/dp_attention.py +71 -21
  48. sglang/srt/layers/layernorm.py +1 -1
  49. sglang/srt/layers/logits_processor.py +46 -11
  50. sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
  51. sglang/srt/layers/moe/ep_moe/layer.py +121 -2
  52. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  56. sglang/srt/layers/moe/topk.py +1 -1
  57. sglang/srt/layers/quantization/__init__.py +1 -1
  58. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  59. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  60. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  61. sglang/srt/layers/quantization/deep_gemm.py +77 -71
  62. sglang/srt/layers/quantization/fp8.py +110 -97
  63. sglang/srt/layers/quantization/fp8_kernel.py +81 -62
  64. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  65. sglang/srt/layers/quantization/int8_kernel.py +2 -2
  66. sglang/srt/layers/quantization/kv_cache.py +3 -10
  67. sglang/srt/layers/quantization/utils.py +0 -5
  68. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  69. sglang/srt/layers/sampler.py +0 -4
  70. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  71. sglang/srt/lora/lora_manager.py +11 -14
  72. sglang/srt/lora/mem_pool.py +4 -4
  73. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  74. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  75. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  76. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  77. sglang/srt/lora/utils.py +1 -1
  78. sglang/srt/managers/cache_controller.py +115 -119
  79. sglang/srt/managers/data_parallel_controller.py +3 -3
  80. sglang/srt/managers/detokenizer_manager.py +21 -8
  81. sglang/srt/managers/io_struct.py +13 -1
  82. sglang/srt/managers/mm_utils.py +1 -1
  83. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  84. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  85. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  86. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  87. sglang/srt/managers/schedule_batch.py +93 -23
  88. sglang/srt/managers/schedule_policy.py +11 -8
  89. sglang/srt/managers/scheduler.py +140 -100
  90. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  91. sglang/srt/managers/tokenizer_manager.py +157 -47
  92. sglang/srt/managers/tp_worker.py +21 -21
  93. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  94. sglang/srt/mem_cache/chunk_cache.py +2 -0
  95. sglang/srt/mem_cache/memory_pool.py +4 -2
  96. sglang/srt/metrics/collector.py +312 -37
  97. sglang/srt/model_executor/cuda_graph_runner.py +10 -11
  98. sglang/srt/model_executor/forward_batch_info.py +1 -1
  99. sglang/srt/model_executor/model_runner.py +57 -41
  100. sglang/srt/model_loader/loader.py +18 -11
  101. sglang/srt/models/clip.py +4 -4
  102. sglang/srt/models/deepseek_janus_pro.py +3 -3
  103. sglang/srt/models/deepseek_nextn.py +1 -20
  104. sglang/srt/models/deepseek_v2.py +77 -39
  105. sglang/srt/models/gemma3_mm.py +1 -1
  106. sglang/srt/models/internlm2.py +3 -0
  107. sglang/srt/models/internvl.py +670 -0
  108. sglang/srt/models/llama.py +3 -1
  109. sglang/srt/models/llama4.py +58 -13
  110. sglang/srt/models/llava.py +248 -5
  111. sglang/srt/models/minicpmv.py +1 -1
  112. sglang/srt/models/mixtral.py +98 -34
  113. sglang/srt/models/mllama.py +1 -1
  114. sglang/srt/models/phi3_small.py +16 -2
  115. sglang/srt/models/pixtral.py +467 -0
  116. sglang/srt/models/qwen2_5_vl.py +8 -4
  117. sglang/srt/models/qwen2_vl.py +4 -4
  118. sglang/srt/models/roberta.py +1 -1
  119. sglang/srt/models/torch_native_llama.py +1 -1
  120. sglang/srt/models/xiaomi_mimo.py +171 -0
  121. sglang/srt/openai_api/adapter.py +52 -42
  122. sglang/srt/openai_api/protocol.py +20 -16
  123. sglang/srt/reasoning_parser.py +1 -1
  124. sglang/srt/sampling/custom_logit_processor.py +18 -3
  125. sglang/srt/sampling/sampling_batch_info.py +2 -2
  126. sglang/srt/sampling/sampling_params.py +2 -0
  127. sglang/srt/server_args.py +64 -10
  128. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  129. sglang/srt/speculative/eagle_utils.py +7 -7
  130. sglang/srt/speculative/eagle_worker.py +22 -19
  131. sglang/srt/utils.py +41 -6
  132. sglang/test/few_shot_gsm8k.py +2 -2
  133. sglang/test/few_shot_gsm8k_engine.py +2 -2
  134. sglang/test/run_eval.py +2 -2
  135. sglang/test/runners.py +8 -1
  136. sglang/test/send_one.py +13 -3
  137. sglang/test/simple_eval_common.py +1 -1
  138. sglang/test/simple_eval_humaneval.py +1 -1
  139. sglang/test/test_block_fp8.py +2 -2
  140. sglang/test/test_deepep_utils.py +219 -0
  141. sglang/test/test_programs.py +5 -5
  142. sglang/test/test_utils.py +92 -15
  143. sglang/utils.py +1 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
  146. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
  147. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
  148. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  149. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
+import logging
 import threading
+import time
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
+from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.io_struct import BatchEmbeddingOut, BatchTokenIDOut
 from sglang.srt.managers.schedule_batch import BaseFinishReason, Req, ScheduleBatch
@@ -15,6 +18,10 @@ if TYPE_CHECKING:
         Scheduler,
     )
 
+logger = logging.getLogger(__name__)
+
+DEFAULT_FORCE_STREAM_INTERVAL = 50
+
 
 class SchedulerOutputProcessorMixin:
     """
@@ -36,20 +43,16 @@ class SchedulerOutputProcessorMixin:
                 next_token_ids,
                 extend_input_len_per_req,
                 extend_logprob_start_len_per_req,
-                bid,
             ) = (
                 result.logits_output,
                 result.next_token_ids,
                 result.extend_input_len_per_req,
                 result.extend_logprob_start_len_per_req,
-                result.bid,
             )
 
             if self.enable_overlap:
-                logits_output, next_token_ids = (
-                    self.tp_worker.resolve_last_batch_result(
-                        launch_done,
-                    )
+                logits_output, next_token_ids, _ = (
+                    self.tp_worker.resolve_last_batch_result(launch_done)
                 )
             else:
                 # Move next_token_ids and logprobs to cpu
@@ -85,6 +88,7 @@
 
                 if req.finished():
                     self.tree_cache.cache_finished_req(req)
+                    req.time_stats.completion_time = time.time()
                 elif not batch.decoding_reqs or req not in batch.decoding_reqs:
                     # This updates radix so others can match
                     self.tree_cache.cache_unfinished_req(req)
@@ -151,10 +155,7 @@
                     )
                     logprob_pt += num_input_logprobs
 
-            if batch.next_batch_sampling_info:
-                batch.next_batch_sampling_info.update_regex_vocab_mask()
-                self.current_stream.synchronize()
-                batch.next_batch_sampling_info.sampling_info_done.set()
+            self.set_next_batch_sampling_info_done(batch)
 
         else:  # embedding or reward model
             embeddings, bid = result.embeddings, result.bid
@@ -187,16 +188,16 @@
         result: GenerationBatchResult,
         launch_done: Optional[threading.Event] = None,
     ):
-        logits_output, next_token_ids, bid = (
+        logits_output, next_token_ids, can_run_cuda_graph = (
             result.logits_output,
             result.next_token_ids,
-            result.bid,
+            result.can_run_cuda_graph,
         )
         self.num_generated_tokens += len(batch.reqs)
 
         if self.enable_overlap:
-            logits_output, next_token_ids = self.tp_worker.resolve_last_batch_result(
-                launch_done
+            logits_output, next_token_ids, can_run_cuda_graph = (
+                self.tp_worker.resolve_last_batch_result(launch_done)
             )
             next_token_logprobs = logits_output.next_token_logprobs
         elif batch.spec_algorithm.is_none():
@@ -235,6 +236,7 @@
             req.check_finished()
             if req.finished():
                 self.tree_cache.cache_finished_req(req)
+                req.time_stats.completion_time = time.time()
 
             if req.return_logprob and batch.spec_algorithm.is_none():
                 # speculative worker handles logprob in speculative decoding
@@ -264,13 +266,8 @@
                 req.grammar.accept_token(next_token_id)
                 req.grammar.finished = req.finished()
 
-        if batch.next_batch_sampling_info:
-            batch.next_batch_sampling_info.update_regex_vocab_mask()
-            self.current_stream.synchronize()
-            batch.next_batch_sampling_info.sampling_info_done.set()
-
+        self.set_next_batch_sampling_info_done(batch)
         self.stream_output(batch.reqs, batch.return_logprob)
-
         self.token_to_kv_pool_allocator.free_group_end()
 
         self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30)
@@ -278,7 +275,7 @@
             self.attn_tp_rank == 0
             and self.forward_ct_decode % self.server_args.decode_log_interval == 0
         ):
-            self.log_decode_stats(running_batch=batch)
+            self.log_decode_stats(can_run_cuda_graph, running_batch=batch)
 
     def add_input_logprob_return_values(
         self: Scheduler,
@@ -512,29 +509,47 @@
             if self.model_config.is_multimodal_gen and req.to_abort:
                 continue
 
-            if (
-                req.finished()
-                # If stream, follow the given stream_interval
-                or (req.stream and len(req.output_ids) % self.stream_interval == 0)
-                # If not stream, we still want to output some tokens to get the benefit of incremental decoding.
-                # TODO(lianmin): this is wrong for speculative decoding because len(req.output_ids) does not
-                # always increase one-by-one.
-                or (
-                    not req.stream
-                    and len(req.output_ids) % 50 == 0
-                    and not self.model_config.is_multimodal_gen
+            if req.finished():
+                if req.finished_output:
+                    # With the overlap schedule, a request will try to output twice and hit this line twice
+                    # because of the one additional delayed token. This "continue" prevented the dummy output.
+                    continue
+                req.finished_output = True
+                should_output = True
+            else:
+                if req.stream:
+                    stream_interval = (
+                        req.sampling_params.stream_interval or self.stream_interval
+                    )
+                    should_output = len(req.output_ids) % stream_interval == 0
+                else:
+                    should_output = (
+                        len(req.output_ids) % DEFAULT_FORCE_STREAM_INTERVAL == 0
+                        and not self.model_config.is_multimodal_gen
+                    )
+
+            if should_output:
+                send_token_offset = req.send_token_offset
+                send_output_token_logprobs_offset = (
+                    req.send_output_token_logprobs_offset
                 )
-            ):
                 rids.append(req.rid)
                 finished_reasons.append(
                     req.finished_reason.to_json() if req.finished_reason else None
                 )
                 decoded_texts.append(req.decoded_text)
                 decode_ids, read_offset = req.init_incremental_detokenize()
-                decode_ids_list.append(decode_ids)
+
+                if self.model_config.is_multimodal_gen:
+                    decode_ids_list.append(decode_ids)
+                else:
+                    decode_ids_list.append(decode_ids[req.send_decode_id_offset :])
+
+                req.send_decode_id_offset = len(decode_ids)
                 read_offsets.append(read_offset)
                 if self.skip_tokenizer_init:
-                    output_ids.append(req.output_ids)
+                    output_ids.append(req.output_ids[send_token_offset:])
+                    req.send_token_offset = len(req.output_ids)
                 skip_special_tokens.append(req.sampling_params.skip_special_tokens)
                 spaces_between_special_tokens.append(
                     req.sampling_params.spaces_between_special_tokens
@@ -548,36 +563,90 @@
                     spec_verify_ct.append(req.spec_verify_ct)
 
                 if return_logprob:
-                    input_token_logprobs_val.append(req.input_token_logprobs_val)
-                    input_token_logprobs_idx.append(req.input_token_logprobs_idx)
-                    output_token_logprobs_val.append(req.output_token_logprobs_val)
-                    output_token_logprobs_idx.append(req.output_token_logprobs_idx)
-                    input_top_logprobs_val.append(req.input_top_logprobs_val)
-                    input_top_logprobs_idx.append(req.input_top_logprobs_idx)
-                    output_top_logprobs_val.append(req.output_top_logprobs_val)
-                    output_top_logprobs_idx.append(req.output_top_logprobs_idx)
-                    input_token_ids_logprobs_val.append(
-                        req.input_token_ids_logprobs_val
-                    )
-                    input_token_ids_logprobs_idx.append(
-                        req.input_token_ids_logprobs_idx
-                    )
-                    output_token_ids_logprobs_val.append(
-                        req.output_token_ids_logprobs_val
-                    )
-                    output_token_ids_logprobs_idx.append(
-                        req.output_token_ids_logprobs_idx
-                    )
+                    if (
+                        req.return_logprob
+                        and not req.input_logprob_sent
+                        # Decode server does not send input logprobs
+                        and self.disaggregation_mode != DisaggregationMode.DECODE
+                    ):
+                        input_token_logprobs_val.append(req.input_token_logprobs_val)
+                        input_token_logprobs_idx.append(req.input_token_logprobs_idx)
+                        input_top_logprobs_val.append(req.input_top_logprobs_val)
+                        input_top_logprobs_idx.append(req.input_top_logprobs_idx)
+                        input_token_ids_logprobs_val.append(
+                            req.input_token_ids_logprobs_val
+                        )
+                        input_token_ids_logprobs_idx.append(
+                            req.input_token_ids_logprobs_idx
+                        )
+                        req.input_logprob_sent = True
+                    else:
+                        input_token_logprobs_val.append([])
+                        input_token_logprobs_idx.append([])
+                        input_top_logprobs_val.append([])
+                        input_top_logprobs_idx.append([])
+                        input_token_ids_logprobs_val.append([])
+                        input_token_ids_logprobs_idx.append([])
+
+                    if req.return_logprob:
+                        output_token_logprobs_val.append(
+                            req.output_token_logprobs_val[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_token_logprobs_idx.append(
+                            req.output_token_logprobs_idx[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_top_logprobs_val.append(
+                            req.output_top_logprobs_val[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_top_logprobs_idx.append(
+                            req.output_top_logprobs_idx[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_token_ids_logprobs_val.append(
+                            req.output_token_ids_logprobs_val[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_token_ids_logprobs_idx.append(
+                            req.output_token_ids_logprobs_idx[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        req.send_output_token_logprobs_offset = len(
+                            req.output_token_logprobs_val
+                        )
+                    else:
+                        output_token_logprobs_val.append([])
+                        output_token_logprobs_idx.append([])
+                        output_top_logprobs_val.append([])
+                        output_top_logprobs_idx.append([])
+                        output_token_ids_logprobs_val.append([])
+                        output_token_ids_logprobs_idx.append([])
 
                 if req.return_hidden_states:
                     if output_hidden_states is None:
                         output_hidden_states = []
                     output_hidden_states.append(req.hidden_states)
 
+            if (
+                req.finished()
+                and self.tp_rank == 0
+                and self.server_args.enable_request_time_stats_logging
+            ):
+                req.log_time_stats()
+
         # Send to detokenizer
         if rids:
             if self.model_config.is_multimodal_gen:
                 return
+
            self.send_to_detokenizer.send_pyobj(
                BatchTokenIDOut(
                    rids,
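
The decode-path hunks above make streamed output incremental (each flush sends only the tokens past `send_token_offset`, plus the matching logprob slices) and let a request override the server-wide stream interval via `req.sampling_params.stream_interval`, with non-streaming requests force-flushed every `DEFAULT_FORCE_STREAM_INTERVAL` (50) tokens. A minimal sketch of exercising the per-request interval through the offline engine follows; the model path is a placeholder, and passing `stream_interval` inside the `sampling_params` dict is an assumption based on the `sglang/srt/sampling/sampling_params.py` change listed above, not something shown in this hunk.

# Sketch only: assumes sglang's offline Engine API and that SamplingParams
# accepts a per-request "stream_interval" (added in this release alongside
# the scheduler changes above). The model path is a placeholder.
import sglang as sgl

if __name__ == "__main__":
    llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")

    # With stream=True, the scheduler flushes output every `stream_interval`
    # decoded tokens instead of the server-wide --stream-interval default.
    generator = llm.generate(
        "List three uses of a radix cache.",
        sampling_params={"max_new_tokens": 64, "stream_interval": 4},
        stream=True,
    )
    for chunk in generator:
        # Per the send_token_offset change above, each chunk appears to carry
        # only newly decoded text rather than the cumulative output.
        print(chunk["text"], end="", flush=True)

    llm.shutdown()

Sending deltas instead of cumulative output keeps the detokenizer payload bounded for long generations, which also appears to be the motivation for slicing the output logprob lists by `send_output_token_logprobs_offset` in the last hunk.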