sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/lang/interpreter.py +40 -1
  5. sglang/lang/ir.py +27 -0
  6. sglang/math_utils.py +8 -0
  7. sglang/srt/configs/model_config.py +6 -0
  8. sglang/srt/conversation.py +6 -0
  9. sglang/srt/disaggregation/base/__init__.py +1 -1
  10. sglang/srt/disaggregation/base/conn.py +25 -11
  11. sglang/srt/disaggregation/common/__init__.py +5 -1
  12. sglang/srt/disaggregation/common/utils.py +42 -0
  13. sglang/srt/disaggregation/decode.py +196 -51
  14. sglang/srt/disaggregation/fake/__init__.py +1 -1
  15. sglang/srt/disaggregation/fake/conn.py +15 -9
  16. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  17. sglang/srt/disaggregation/mooncake/conn.py +18 -13
  18. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  19. sglang/srt/disaggregation/nixl/conn.py +17 -12
  20. sglang/srt/disaggregation/prefill.py +128 -43
  21. sglang/srt/disaggregation/utils.py +127 -123
  22. sglang/srt/entrypoints/engine.py +15 -1
  23. sglang/srt/entrypoints/http_server.py +13 -2
  24. sglang/srt/eplb_simulator/__init__.py +1 -0
  25. sglang/srt/eplb_simulator/reader.py +51 -0
  26. sglang/srt/layers/activation.py +19 -0
  27. sglang/srt/layers/attention/aiter_backend.py +15 -2
  28. sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
  29. sglang/srt/layers/attention/flashattention_backend.py +53 -64
  30. sglang/srt/layers/attention/flashinfer_backend.py +1 -2
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
  32. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  33. sglang/srt/layers/attention/triton_backend.py +119 -119
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  35. sglang/srt/layers/attention/vision.py +51 -24
  36. sglang/srt/layers/communicator.py +23 -5
  37. sglang/srt/layers/linear.py +0 -4
  38. sglang/srt/layers/logits_processor.py +0 -12
  39. sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
  40. sglang/srt/layers/moe/ep_moe/layer.py +42 -32
  41. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  42. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
  43. sglang/srt/layers/moe/topk.py +16 -8
  44. sglang/srt/layers/pooler.py +56 -0
  45. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  46. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  47. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  49. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  50. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  51. sglang/srt/layers/radix_attention.py +2 -3
  52. sglang/srt/lora/lora_manager.py +79 -34
  53. sglang/srt/lora/mem_pool.py +4 -5
  54. sglang/srt/managers/cache_controller.py +2 -1
  55. sglang/srt/managers/io_struct.py +28 -4
  56. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  57. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  58. sglang/srt/managers/schedule_batch.py +39 -6
  59. sglang/srt/managers/scheduler.py +73 -17
  60. sglang/srt/managers/tokenizer_manager.py +29 -2
  61. sglang/srt/mem_cache/chunk_cache.py +1 -0
  62. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  63. sglang/srt/mem_cache/memory_pool.py +111 -407
  64. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  65. sglang/srt/mem_cache/radix_cache.py +36 -12
  66. sglang/srt/model_executor/cuda_graph_runner.py +122 -55
  67. sglang/srt/model_executor/forward_batch_info.py +14 -5
  68. sglang/srt/model_executor/model_runner.py +6 -6
  69. sglang/srt/model_loader/loader.py +8 -1
  70. sglang/srt/models/bert.py +113 -13
  71. sglang/srt/models/deepseek_v2.py +113 -155
  72. sglang/srt/models/internvl.py +46 -102
  73. sglang/srt/models/roberta.py +117 -9
  74. sglang/srt/models/vila.py +305 -0
  75. sglang/srt/openai_api/adapter.py +162 -4
  76. sglang/srt/openai_api/protocol.py +37 -1
  77. sglang/srt/sampling/sampling_batch_info.py +24 -0
  78. sglang/srt/sampling/sampling_params.py +2 -0
  79. sglang/srt/server_args.py +318 -233
  80. sglang/srt/speculative/build_eagle_tree.py +1 -1
  81. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
  82. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
  83. sglang/srt/speculative/eagle_utils.py +389 -109
  84. sglang/srt/speculative/eagle_worker.py +134 -43
  85. sglang/srt/two_batch_overlap.py +4 -2
  86. sglang/srt/utils.py +58 -0
  87. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  88. sglang/test/runners.py +38 -3
  89. sglang/test/test_block_fp8.py +1 -0
  90. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  91. sglang/test/test_block_fp8_ep.py +1 -0
  92. sglang/test/test_utils.py +3 -1
  93. sglang/utils.py +9 -0
  94. sglang/version.py +1 -1
  95. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
  96. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
  97. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -15,6 +15,7 @@ from sglang.api import (
15
15
  get_server_info,
16
16
  image,
17
17
  select,
18
+ separate_reasoning,
18
19
  set_default_backend,
19
20
  system,
20
21
  system_begin,
@@ -54,6 +55,7 @@ __all__ = [
54
55
  "get_server_info",
55
56
  "image",
56
57
  "select",
58
+ "separate_reasoning",
57
59
  "set_default_backend",
58
60
  "system",
59
61
  "system_begin",
sglang/api.py CHANGED
@@ -15,6 +15,7 @@ from sglang.lang.ir import (
15
15
  SglRoleBegin,
16
16
  SglRoleEnd,
17
17
  SglSelect,
18
+ SglSeparateReasoning,
18
19
  SglVideo,
19
20
  )
20
21
 
@@ -277,3 +278,9 @@ def assistant_begin():
277
278
 
278
279
  def assistant_end():
279
280
  return SglRoleEnd("assistant")
281
+
282
+
283
+ def separate_reasoning(
284
+ expr: Optional[SglExpr] = None, model_type: Optional[str] = None
285
+ ):
286
+ return SglExprList([expr, SglSeparateReasoning(model_type, expr=expr)])
sglang/bench_serving.py CHANGED
@@ -399,7 +399,7 @@ async def async_request_sglang_generate(
399
399
  # NOTE: Some completion API might have a last
400
400
  # usage summary response without a token so we
401
401
  # want to check a token was generated
402
- if data["text"]:
402
+ if "text" in data and data["text"]:
403
403
  timestamp = time.perf_counter()
404
404
  generated_text = data["text"]
405
405
  output_len = data["meta_info"]["completion_tokens"]
sglang/lang/interpreter.py CHANGED
@@ -26,6 +26,7 @@ from sglang.lang.ir import (
26
26
  SglRoleBegin,
27
27
  SglRoleEnd,
28
28
  SglSelect,
29
+ SglSeparateReasoning,
29
30
  SglVariable,
30
31
  SglVarScopeBegin,
31
32
  SglVarScopeEnd,
@@ -472,6 +473,8 @@ class StreamExecutor:
472
473
  self._execute_concatenate_and_append_kv_cache(other)
473
474
  else:
474
475
  self._execute_concatenate_and_append_text(other)
476
+ elif isinstance(other, SglSeparateReasoning):
477
+ self._execute_separate_reasoning(other)
475
478
  else:
476
479
  raise ValueError(f"Unknown type: {type(other)}")
477
480
 
@@ -724,8 +727,44 @@ class StreamExecutor:
724
727
  src_rids = [state.stream_executor.sid for state in expr.states]
725
728
  self.backend.concatenate_and_append(src_rids, self.sid)
726
729
 
730
+ def _execute_separate_reasoning(self, expr: SglSeparateReasoning):
731
+ if self.stream:
732
+ # separate reasoning for stream is not supported
733
+ return
734
+
735
+ if (
736
+ self.cur_role == "assistant"
737
+ and self.num_api_spec_tokens is not None
738
+ and self.backend.is_chat_model
739
+ ):
740
+ # Execute the stored lazy generation calls
741
+ self.backend.role_end_generate(self)
742
+
743
+ from sglang.srt.reasoning_parser import ReasoningParser
744
+
745
+ reasoning_parser = ReasoningParser(expr.model_type)
746
+ other = expr.expr
747
+ if not other:
748
+ return
749
+ elif isinstance(other, SglGen) or isinstance(other, SglSelect):
750
+ cur_text = self.get_var(other.name)
751
+ reasoning, normal_text = reasoning_parser.parse_non_stream(cur_text)
752
+ reasoning_name = expr.process_name_for_reasoning(other.name)
753
+ self.set_var(other.name, normal_text)
754
+ self.set_var(reasoning_name, reasoning)
755
+ # the variable is ready to be used
756
+ self.variable_event[reasoning_name].set()
757
+ self.text_ = self.text_[: self.cur_role_begin_pos] + normal_text
758
+ elif isinstance(other, SglExprList):
759
+ for x in other.expr_list:
760
+ self._execute_separate_reasoning(
761
+ SglSeparateReasoning(expr.model_type, x)
762
+ )
763
+
727
764
  def _init_var_event(self, expr):
728
- if isinstance(expr, (SglGen, SglSelect, SglVarScopeBegin)):
765
+ if isinstance(
766
+ expr, (SglGen, SglSelect, SglVarScopeBegin, SglSeparateReasoning)
767
+ ):
729
768
  self.variable_event[expr.name] = threading.Event()
730
769
  if self.stream:
731
770
  self.stream_var_event[expr.name] = threading.Event()
sglang/lang/ir.py CHANGED
@@ -606,3 +606,30 @@ class SglCommitLazy(SglExpr):
606
606
 
607
607
  def __repr__(self):
608
608
  return "CommitLazy()"
609
+
610
+
611
+ class SglSeparateReasoning(SglExpr):
612
+ def __init__(self, model_type: str, expr: SglExpr):
613
+ super().__init__()
614
+ self.model_type = model_type
615
+
616
+ self.expr = expr
617
+ self.name = None
618
+ self._process_expr(expr)
619
+
620
+ def process_name_for_reasoning(self, name):
621
+ if not name:
622
+ raise ValueError("name must be provided")
623
+ return f"{name}_reasoning_content"
624
+
625
+ def _process_expr(self, expr):
626
+ if isinstance(expr, SglGen):
627
+ self.name = self.process_name_for_reasoning(expr.name)
628
+ elif isinstance(expr, SglSelect):
629
+ self.name = self.process_name_for_reasoning(expr.name)
630
+ elif isinstance(expr, SglExprList):
631
+ for x in expr.expr_list:
632
+ self._process_expr(x)
633
+
634
+ def __repr__(self):
635
+ return f"SeparateReasoning(model_type={self.model_type}, name={self.name})"
sglang/math_utils.py ADDED
@@ -0,0 +1,8 @@
1
+ # COPIED FROM DeepGEMM
2
+ def align(x: int, y: int) -> int:
3
+ return ceil_div(x, y) * y
4
+
5
+
6
+ # COPIED FROM DeepGEMM
7
+ def ceil_div(x: int, y: int) -> int:
8
+ return (x + y - 1) // y
sglang/srt/configs/model_config.py CHANGED
@@ -550,6 +550,11 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
550
550
  or "Qwen2ForRewardModel" in model_architectures
551
551
  or "Qwen2ForSequenceClassification" in model_architectures
552
552
  or "CLIPModel" in model_architectures
553
+ or "BertModel" in model_architectures
554
+ or "Contriever" in model_architectures
555
+ or "BertForSequenceClassification" in model_architectures
556
+ or "XLMRobertaModel" in model_architectures
557
+ or "XLMRobertaForSequenceClassification" in model_architectures
553
558
  ):
554
559
  return False
555
560
  else:
@@ -578,6 +583,7 @@ multimodal_model_archs = [
578
583
  "KimiVLForConditionalGeneration",
579
584
  "InternVLChatModel",
580
585
  "Phi4MMForCausalLM",
586
+ "VILAForConditionalGeneration",
581
587
  ]
582
588
 
583
589
 
sglang/srt/conversation.py CHANGED
@@ -983,3 +983,9 @@ def match_devstral(model_path: str):
983
983
  def match_phi_4_mm(model_path: str):
984
984
  if "phi-4-multimodal" in model_path.lower():
985
985
  return "phi-4-mm"
986
+
987
+
988
+ @register_conv_template_matching_function
989
+ def match_vila(model_path: str):
990
+ if re.search(r"vila", model_path, re.IGNORECASE):
991
+ return "chatml"
sglang/srt/disaggregation/base/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from .conn import (
1
+ from sglang.srt.disaggregation.base.conn import (
2
2
  BaseKVBootstrapServer,
3
3
  BaseKVManager,
4
4
  BaseKVReceiver,
sglang/srt/disaggregation/base/conn.py CHANGED
@@ -1,23 +1,32 @@
1
+ from __future__ import annotations
2
+
1
3
  from abc import ABC, abstractmethod
2
- from typing import Optional
4
+ from typing import TYPE_CHECKING, List, Optional
3
5
 
4
6
  import numpy as np
5
7
  import numpy.typing as npt
6
8
 
7
- from sglang.srt.disaggregation.utils import DisaggregationMode
8
9
  from sglang.srt.server_args import ServerArgs
9
10
 
11
+ if TYPE_CHECKING:
12
+ from sglang.srt.disaggregation.utils import DisaggregationMode
13
+
10
14
 
11
15
  class KVArgs:
12
16
  engine_rank: int
13
- kv_data_ptrs: list[int]
14
- kv_data_lens: list[int]
15
- kv_item_lens: list[int]
16
- aux_data_ptrs: list[int]
17
- aux_data_lens: list[int]
18
- aux_item_lens: list[int]
17
+ kv_data_ptrs: List[int]
18
+ kv_data_lens: List[int]
19
+ kv_item_lens: List[int]
20
+ aux_data_ptrs: List[int]
21
+ aux_data_lens: List[int]
22
+ aux_item_lens: List[int]
19
23
  ib_device: str
24
+ ib_traffic_class: str
20
25
  gpu_id: int
26
+ # for different tp
27
+ decode_tp_size: int
28
+ # for pp prefill
29
+ prefill_pp_size: int
21
30
 
22
31
 
23
32
  class KVPoll:
@@ -45,7 +54,12 @@ class BaseKVSender(ABC):
45
54
 
46
55
  @abstractmethod
47
56
  def __init__(
48
- self, mgr: BaseKVManager, bootstrap_addr: str, bootstrap_room: int
57
+ self,
58
+ mgr: BaseKVManager,
59
+ bootstrap_addr: str,
60
+ bootstrap_room: int,
61
+ dest_tp_ranks: List[int],
62
+ pp_rank: int,
49
63
  ): ...
50
64
 
51
65
  @abstractmethod
@@ -56,7 +70,7 @@ class BaseKVSender(ABC):
56
70
  ...
57
71
 
58
72
  @abstractmethod
59
- def send(self, kv_indices: npt.NDArray[np.int64]):
73
+ def send(self, kv_indices: npt.NDArray[np.int32]):
60
74
  """
61
75
  Send the kv cache at the given kv indices to the decoder server
62
76
  """
@@ -88,7 +102,7 @@ class BaseKVReceiver(ABC):
88
102
  ): ...
89
103
 
90
104
  @abstractmethod
91
- def init(self, kv_indices: npt.NDArray[np.int64], aux_index: Optional[int] = None):
105
+ def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
92
106
  """
93
107
  Notify the prefill server about the kv indices and aux index
94
108
  """
sglang/srt/disaggregation/common/__init__.py CHANGED
@@ -1 +1,5 @@
1
- from .conn import CommonKVBootstrapServer, CommonKVManager, CommonKVReceiver
1
+ from sglang.srt.disaggregation.common.conn import (
2
+ CommonKVBootstrapServer,
3
+ CommonKVManager,
4
+ CommonKVReceiver,
5
+ )
sglang/srt/disaggregation/common/utils.py ADDED
@@ -0,0 +1,42 @@
1
+ import threading
2
+ from collections import deque
3
+ from typing import List, Tuple
4
+
5
+ import numpy as np
6
+ import numpy.typing as npt
7
+
8
+
9
+ class FastQueue:
10
+ def __init__(self):
11
+ self._buf = deque()
12
+ self._cond = threading.Condition()
13
+
14
+ def put(self, item):
15
+ with self._cond:
16
+ self._buf.append(item)
17
+ # wake up a thread of wait()
18
+ self._cond.notify()
19
+
20
+ def get(self):
21
+ with self._cond:
22
+ # if queue is empty ,block until is notified()
23
+ while not self._buf:
24
+ self._cond.wait()
25
+ return self._buf.popleft()
26
+
27
+
28
+ def group_concurrent_contiguous(
29
+ src_indices: npt.NDArray[np.int32], dst_indices: npt.NDArray[np.int32]
30
+ ) -> Tuple[List[npt.NDArray[np.int32]], List[npt.NDArray[np.int32]]]:
31
+ """Vectorised NumPy implementation."""
32
+ if src_indices.size == 0:
33
+ return [], []
34
+
35
+ brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
36
+ src_groups = np.split(src_indices, brk)
37
+ dst_groups = np.split(dst_indices, brk)
38
+
39
+ src_groups = [g.tolist() for g in src_groups]
40
+ dst_groups = [g.tolist() for g in dst_groups]
41
+
42
+ return src_groups, dst_groups