sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries, and is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/conversation.py +6 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +196 -51
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +18 -13
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +128 -43
- sglang/srt/disaggregation/utils.py +127 -123
- sglang/srt/entrypoints/engine.py +15 -1
- sglang/srt/entrypoints/http_server.py +13 -2
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +15 -2
- sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
- sglang/srt/layers/attention/flashattention_backend.py +53 -64
- sglang/srt/layers/attention/flashinfer_backend.py +1 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/triton_backend.py +119 -119
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +23 -5
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
- sglang/srt/layers/moe/ep_moe/layer.py +42 -32
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
- sglang/srt/layers/moe/topk.py +16 -8
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/lora/lora_manager.py +79 -34
- sglang/srt/lora/mem_pool.py +4 -5
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/io_struct.py +28 -4
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +39 -6
- sglang/srt/managers/scheduler.py +73 -17
- sglang/srt/managers/tokenizer_manager.py +29 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/model_executor/cuda_graph_runner.py +122 -55
- sglang/srt/model_executor/forward_batch_info.py +14 -5
- sglang/srt/model_executor/model_runner.py +6 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_v2.py +113 -155
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +162 -4
- sglang/srt/openai_api/protocol.py +37 -1
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +318 -233
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
- sglang/srt/speculative/eagle_utils.py +389 -109
- sglang/srt/speculative/eagle_worker.py +134 -43
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +58 -0
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from sglang.api import (
     get_server_info,
     image,
     select,
+    separate_reasoning,
     set_default_backend,
     system,
     system_begin,
@@ -54,6 +55,7 @@ __all__ = [
     "get_server_info",
     "image",
     "select",
+    "separate_reasoning",
     "set_default_backend",
     "system",
     "system_begin",
sglang/api.py
CHANGED
@@ -15,6 +15,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglSeparateReasoning,
     SglVideo,
 )
 
@@ -277,3 +278,9 @@ def assistant_begin():
 
 def assistant_end():
     return SglRoleEnd("assistant")
+
+
+def separate_reasoning(
+    expr: Optional[SglExpr] = None, model_type: Optional[str] = None
+):
+    return SglExprList([expr, SglSeparateReasoning(model_type, expr=expr)])
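A minimal usage sketch of the new separate_reasoning primitive (illustrative only: the program, the question, and the "deepseek-r1" model_type are assumptions, and a backend serving a matching reasoning model must already be attached via sgl.set_default_backend; streaming runs skip the split, per the interpreter change below):

import sglang as sgl

@sgl.function
def solve(s, question):
    s += sgl.user(question)
    # Wrapping gen() lets the interpreter split the raw output into
    # state["answer"] and state["answer_reasoning_content"].
    s += sgl.assistant(
        sgl.separate_reasoning(
            sgl.gen("answer", max_tokens=512), model_type="deepseek-r1"
        )
    )

state = solve.run(question="What is 3 * 17?")
print(state["answer"])                    # final answer only
print(state["answer_reasoning_content"])  # extracted reasoning trace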
sglang/bench_serving.py
CHANGED
@@ -399,7 +399,7 @@ async def async_request_sglang_generate(
                         # NOTE: Some completion API might have a last
                         # usage summary response without a token so we
                         # want to check a token was generated
-                        if data["text"]:
+                        if "text" in data and data["text"]:
                             timestamp = time.perf_counter()
                             generated_text = data["text"]
                             output_len = data["meta_info"]["completion_tokens"]
sglang/lang/interpreter.py
CHANGED
@@ -26,6 +26,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglSeparateReasoning,
     SglVariable,
     SglVarScopeBegin,
     SglVarScopeEnd,
@@ -472,6 +473,8 @@ class StreamExecutor:
                 self._execute_concatenate_and_append_kv_cache(other)
             else:
                 self._execute_concatenate_and_append_text(other)
+        elif isinstance(other, SglSeparateReasoning):
+            self._execute_separate_reasoning(other)
         else:
             raise ValueError(f"Unknown type: {type(other)}")
 
@@ -724,8 +727,44 @@
         src_rids = [state.stream_executor.sid for state in expr.states]
         self.backend.concatenate_and_append(src_rids, self.sid)
 
+    def _execute_separate_reasoning(self, expr: SglSeparateReasoning):
+        if self.stream:
+            # separate reasoning for stream is not supported
+            return
+
+        if (
+            self.cur_role == "assistant"
+            and self.num_api_spec_tokens is not None
+            and self.backend.is_chat_model
+        ):
+            # Execute the stored lazy generation calls
+            self.backend.role_end_generate(self)
+
+        from sglang.srt.reasoning_parser import ReasoningParser
+
+        reasoning_parser = ReasoningParser(expr.model_type)
+        other = expr.expr
+        if not other:
+            return
+        elif isinstance(other, SglGen) or isinstance(other, SglSelect):
+            cur_text = self.get_var(other.name)
+            reasoning, normal_text = reasoning_parser.parse_non_stream(cur_text)
+            reasoning_name = expr.process_name_for_reasoning(other.name)
+            self.set_var(other.name, normal_text)
+            self.set_var(reasoning_name, reasoning)
+            # the variable is ready to be used
+            self.variable_event[reasoning_name].set()
+            self.text_ = self.text_[: self.cur_role_begin_pos] + normal_text
+        elif isinstance(other, SglExprList):
+            for x in other.expr_list:
+                self._execute_separate_reasoning(
+                    SglSeparateReasoning(expr.model_type, x)
+                )
+
     def _init_var_event(self, expr):
-        if isinstance(expr, (SglGen, SglSelect, SglVarScopeBegin)):
+        if isinstance(
+            expr, (SglGen, SglSelect, SglVarScopeBegin, SglSeparateReasoning)
+        ):
             self.variable_event[expr.name] = threading.Event()
             if self.stream:
                 self.stream_var_event[expr.name] = threading.Event()
sglang/lang/ir.py
CHANGED
@@ -606,3 +606,30 @@ class SglCommitLazy(SglExpr):
 
     def __repr__(self):
         return "CommitLazy()"
+
+
+class SglSeparateReasoning(SglExpr):
+    def __init__(self, model_type: str, expr: SglExpr):
+        super().__init__()
+        self.model_type = model_type
+
+        self.expr = expr
+        self.name = None
+        self._process_expr(expr)
+
+    def process_name_for_reasoning(self, name):
+        if not name:
+            raise ValueError("name must be provided")
+        return f"{name}_reasoning_content"
+
+    def _process_expr(self, expr):
+        if isinstance(expr, SglGen):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglSelect):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglExprList):
+            for x in expr.expr_list:
+                self._process_expr(x)
+
+    def __repr__(self):
+        return f"SeparateReasoning(model_type={self.model_type}, name={self.name})"
sglang/math_utils.py
ADDED

sglang/srt/configs/model_config.py
CHANGED
@@ -550,6 +550,11 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "Qwen2ForRewardModel" in model_architectures
         or "Qwen2ForSequenceClassification" in model_architectures
         or "CLIPModel" in model_architectures
+        or "BertModel" in model_architectures
+        or "Contriever" in model_architectures
+        or "BertForSequenceClassification" in model_architectures
+        or "XLMRobertaModel" in model_architectures
+        or "XLMRobertaForSequenceClassification" in model_architectures
     ):
         return False
     else:
@@ -578,6 +583,7 @@ multimodal_model_archs = [
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
     "Phi4MMForCausalLM",
+    "VILAForConditionalGeneration",
 ]
 
 
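A quick sketch of what the added architecture names change (assumes the unchanged else branch of is_generation_model still reports decoder-only architectures as generative):

from sglang.srt.configs.model_config import is_generation_model

# BERT/XLM-R style encoders are now classified as non-generative (embedding path).
assert not is_generation_model(["BertModel"])
assert not is_generation_model(["XLMRobertaForSequenceClassification"])
# A decoder-only architecture is still treated as a generation model.
assert is_generation_model(["LlamaForCausalLM"])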
sglang/srt/conversation.py
CHANGED
@@ -983,3 +983,9 @@ def match_devstral(model_path: str):
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
+
+
+@register_conv_template_matching_function
+def match_vila(model_path: str):
+    if re.search(r"vila", model_path, re.IGNORECASE):
+        return "chatml"
sglang/srt/disaggregation/base/conn.py
CHANGED
@@ -1,23 +1,32 @@
+from __future__ import annotations
+
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
 import numpy.typing as npt
 
-from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
 
+if TYPE_CHECKING:
+    from sglang.srt.disaggregation.utils import DisaggregationMode
+
 
 class KVArgs:
     engine_rank: int
-    kv_data_ptrs:
-    kv_data_lens:
-    kv_item_lens:
-    aux_data_ptrs:
-    aux_data_lens:
-    aux_item_lens:
+    kv_data_ptrs: List[int]
+    kv_data_lens: List[int]
+    kv_item_lens: List[int]
+    aux_data_ptrs: List[int]
+    aux_data_lens: List[int]
+    aux_item_lens: List[int]
     ib_device: str
+    ib_traffic_class: str
     gpu_id: int
+    # for different tp
+    decode_tp_size: int
+    # for pp prefill
+    prefill_pp_size: int
 
 
 class KVPoll:
@@ -45,7 +54,12 @@ class BaseKVSender(ABC):
 
     @abstractmethod
     def __init__(
-        self,
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
     ): ...
 
     @abstractmethod
@@ -56,7 +70,7 @@ class BaseKVSender(ABC):
         ...
 
     @abstractmethod
-    def send(self, kv_indices: npt.NDArray[np.
+    def send(self, kv_indices: npt.NDArray[np.int32]):
         """
         Send the kv cache at the given kv indices to the decoder server
         """
@@ -88,7 +102,7 @@ class BaseKVReceiver(ABC):
     ): ...
 
     @abstractmethod
-    def init(self, kv_indices: npt.NDArray[np.
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
         """
         Notify the prefill server about the kv indices and aux index
         """
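The KVArgs fields are now explicitly typed; a minimal sketch of populating them (every value below is an illustrative placeholder, not taken from the package):

from sglang.srt.disaggregation.base.conn import KVArgs

args = KVArgs()
args.engine_rank = 0
args.kv_data_ptrs = [0x7F0000000000]   # device pointers of the KV pool buffers
args.kv_data_lens = [1 << 20]          # byte length of each buffer
args.kv_item_lens = [4096]             # byte length of a single cache item
args.aux_data_ptrs, args.aux_data_lens, args.aux_item_lens = [], [], []
args.ib_device = "mlx5_0"
args.ib_traffic_class = ""
args.gpu_id = 0
args.decode_tp_size = 1    # for different tp
args.prefill_pp_size = 1   # for pp prefill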
sglang/srt/disaggregation/common/utils.py
ADDED
@@ -0,0 +1,42 @@
+import threading
+from collections import deque
+from typing import List, Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+
+class FastQueue:
+    def __init__(self):
+        self._buf = deque()
+        self._cond = threading.Condition()
+
+    def put(self, item):
+        with self._cond:
+            self._buf.append(item)
+            # wake up a thread of wait()
+            self._cond.notify()
+
+    def get(self):
+        with self._cond:
+            # if queue is empty ,block until is notified()
+            while not self._buf:
+                self._cond.wait()
+            return self._buf.popleft()
+
+
+def group_concurrent_contiguous(
+    src_indices: npt.NDArray[np.int32], dst_indices: npt.NDArray[np.int32]
+) -> Tuple[List[npt.NDArray[np.int32]], List[npt.NDArray[np.int32]]]:
+    """Vectorised NumPy implementation."""
+    if src_indices.size == 0:
+        return [], []
+
+    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
+    src_groups = np.split(src_indices, brk)
+    dst_groups = np.split(dst_indices, brk)
+
+    src_groups = [g.tolist() for g in src_groups]
+    dst_groups = [g.tolist() for g in dst_groups]
+
+    return src_groups, dst_groups
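A short usage sketch of the two helpers (inputs are made up; the import path assumes the file lands at sglang/srt/disaggregation/common/utils.py as listed above):

import threading

import numpy as np

from sglang.srt.disaggregation.common.utils import FastQueue, group_concurrent_contiguous

# group_concurrent_contiguous splits the paired index arrays wherever either
# side stops being contiguous, so each run can be transferred in one shot.
src = np.array([0, 1, 2, 10, 11], dtype=np.int32)
dst = np.array([5, 6, 7, 8, 9], dtype=np.int32)
src_groups, dst_groups = group_concurrent_contiguous(src, dst)
print(src_groups)  # [[0, 1, 2], [10, 11]]
print(dst_groups)  # [[5, 6, 7], [8, 9]]

# FastQueue.get() blocks until another thread puts an item.
q = FastQueue()
threading.Thread(target=lambda: q.put("kv-chunk")).start()
print(q.get())  # "kv-chunk"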