sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/conversation.py +38 -5
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +87 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +26 -7
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +374 -136
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +13 -13
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +25 -27
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +237 -204
- sglang/srt/managers/detokenizer_manager.py +48 -2
- sglang/srt/managers/io_struct.py +57 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +94 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +122 -42
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +51 -23
- sglang/srt/mem_cache/hiradix_cache.py +87 -71
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +77 -14
- sglang/srt/mem_cache/memory_pool_host.py +4 -5
- sglang/srt/mem_cache/radix_cache.py +6 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +6 -5
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +38 -13
- sglang/srt/models/gpt_oss.py +2 -15
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +66 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +122 -56
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +73 -5
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
@@ -32,11 +32,14 @@ from sglang.srt.managers.io_struct import (
|
|
32
32
|
BatchStrOut,
|
33
33
|
BatchTokenIDOut,
|
34
34
|
FreezeGCReq,
|
35
|
+
MultiTokenizerRegisterReq,
|
35
36
|
)
|
37
|
+
from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerMixin
|
36
38
|
from sglang.srt.server_args import PortArgs, ServerArgs
|
37
39
|
from sglang.srt.utils import (
|
38
40
|
configure_logger,
|
39
41
|
freeze_gc,
|
42
|
+
get_worker_ids_from_req_rids,
|
40
43
|
get_zmq_socket,
|
41
44
|
kill_itself_when_parent_died,
|
42
45
|
)
|
@@ -67,7 +70,7 @@ class DecodeStatus:
|
|
67
70
|
sent_offset: int = 0
|
68
71
|
|
69
72
|
|
70
|
-
class DetokenizerManager:
|
73
|
+
class DetokenizerManager(MultiTokenizerMixin):
|
71
74
|
"""DetokenizerManager is a process that detokenizes the token ids."""
|
72
75
|
|
73
76
|
def __init__(
|
@@ -102,10 +105,13 @@ class DetokenizerManager:
|
|
102
105
|
(BatchEmbeddingOut, self.handle_batch_embedding_out),
|
103
106
|
(BatchTokenIDOut, self.handle_batch_token_id_out),
|
104
107
|
(BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
|
108
|
+
(MultiTokenizerRegisterReq, lambda x: x),
|
105
109
|
(FreezeGCReq, self.handle_freeze_gc_req),
|
106
110
|
]
|
107
111
|
)
|
108
112
|
|
113
|
+
self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
|
114
|
+
|
109
115
|
def event_loop(self):
|
110
116
|
"""The event loop that handles requests"""
|
111
117
|
while True:
|
@@ -114,6 +120,39 @@ class DetokenizerManager:
|
|
114
120
|
if output is not None:
|
115
121
|
self.send_to_tokenizer.send_pyobj(output)
|
116
122
|
|
123
|
+
def multi_tokenizer_manager_event_loop(self):
|
124
|
+
"""The event loop that handles requests, for multi tokenizer manager mode only"""
|
125
|
+
self.create_sockets_mapping()
|
126
|
+
while True:
|
127
|
+
recv_obj = self.recv_from_scheduler.recv_pyobj()
|
128
|
+
output = self._request_dispatcher(recv_obj)
|
129
|
+
if output is None:
|
130
|
+
continue
|
131
|
+
# Extract worker_id from rid
|
132
|
+
if isinstance(recv_obj.rids, list):
|
133
|
+
worker_ids = get_worker_ids_from_req_rids(recv_obj.rids)
|
134
|
+
else:
|
135
|
+
raise RuntimeError(
|
136
|
+
f"for tokenizer_worker_num > 1, recv_obj.rids must be a list"
|
137
|
+
)
|
138
|
+
|
139
|
+
# Send data using the corresponding socket
|
140
|
+
for i, worker_id in enumerate(worker_ids):
|
141
|
+
if isinstance(recv_obj, MultiTokenizerRegisterReq):
|
142
|
+
if self.register_tokenizer_ipc(recv_obj, worker_id):
|
143
|
+
logger.info(
|
144
|
+
f"DetokenizerManager Created ZMQ socket for worker {worker_id}"
|
145
|
+
)
|
146
|
+
continue
|
147
|
+
else:
|
148
|
+
if worker_id not in self.tokenizer_mapping:
|
149
|
+
logger.error(
|
150
|
+
f"Tokenizer Worker ID {worker_id} not registered. Check if the server Process {worker_id} is alive"
|
151
|
+
)
|
152
|
+
continue
|
153
|
+
new_output = self._handle_output_by_index(output, i)
|
154
|
+
self.tokenizer_mapping[worker_id].send_pyobj(new_output)
|
155
|
+
|
117
156
|
def trim_matched_stop(
|
118
157
|
self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
|
119
158
|
):
|
@@ -133,6 +172,9 @@ class DetokenizerManager:
|
|
133
172
|
|
134
173
|
# Trim stop token.
|
135
174
|
if isinstance(matched, int) and isinstance(output, list):
|
175
|
+
# 200012 <|call|> is the tool call token and one of eos tokens for gpt-oss model
|
176
|
+
if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
|
177
|
+
return output
|
136
178
|
assert len(output) > 0
|
137
179
|
return output[:-1]
|
138
180
|
return output
|
@@ -280,8 +322,12 @@ def run_detokenizer_process(
|
|
280
322
|
|
281
323
|
try:
|
282
324
|
manager = DetokenizerManager(server_args, port_args)
|
283
|
-
|
325
|
+
if server_args.tokenizer_worker_num > 1:
|
326
|
+
manager.multi_tokenizer_manager_event_loop()
|
327
|
+
else:
|
328
|
+
manager.event_loop()
|
284
329
|
except Exception:
|
330
|
+
manager.clear_tokenizer_mapping()
|
285
331
|
traceback = get_exception_traceback()
|
286
332
|
logger.error(f"DetokenizerManager hit an exception: {traceback}")
|
287
333
|
parent_process.send_signal(signal.SIGQUIT)
|
sglang/srt/managers/io_struct.py
CHANGED
@@ -533,6 +533,21 @@ class TokenizedGenerateReqInput:
|
|
533
533
|
dp_balance_id: int = -1
|
534
534
|
|
535
535
|
|
536
|
+
@dataclass
|
537
|
+
class BatchTokenizedGenerateReqInput:
|
538
|
+
# The batch of tokenized requests
|
539
|
+
batch: List[TokenizedGenerateReqInput]
|
540
|
+
|
541
|
+
def __len__(self):
|
542
|
+
return len(self.batch)
|
543
|
+
|
544
|
+
def __getitem__(self, i):
|
545
|
+
return self.batch[i]
|
546
|
+
|
547
|
+
def __iter__(self):
|
548
|
+
return iter(self.batch)
|
549
|
+
|
550
|
+
|
536
551
|
@dataclass
|
537
552
|
class EmbeddingReqInput:
|
538
553
|
# The input prompt. It can be a single prompt or a batch of prompts.
|
@@ -668,6 +683,21 @@ class TokenizedEmbeddingReqInput:
|
|
668
683
|
dp_balance_id: int = -1
|
669
684
|
|
670
685
|
|
686
|
+
@dataclass
|
687
|
+
class BatchTokenizedEmbeddingReqInput:
|
688
|
+
# The batch of tokenized embedding requests
|
689
|
+
batch: List[TokenizedEmbeddingReqInput]
|
690
|
+
|
691
|
+
def __len__(self):
|
692
|
+
return len(self.batch)
|
693
|
+
|
694
|
+
def __getitem__(self, i):
|
695
|
+
return self.batch[i]
|
696
|
+
|
697
|
+
def __iter__(self):
|
698
|
+
return iter(self.batch)
|
699
|
+
|
700
|
+
|
671
701
|
@dataclass
|
672
702
|
class BatchTokenIDOut:
|
673
703
|
# The request id
|
@@ -784,6 +814,16 @@ class BatchEmbeddingOut:
|
|
784
814
|
cached_tokens: List[int]
|
785
815
|
|
786
816
|
|
817
|
+
@dataclass
|
818
|
+
class ClearHiCacheReqInput:
|
819
|
+
pass
|
820
|
+
|
821
|
+
|
822
|
+
@dataclass
|
823
|
+
class ClearHiCacheReqOutput:
|
824
|
+
success: bool
|
825
|
+
|
826
|
+
|
787
827
|
@dataclass
|
788
828
|
class FlushCacheReqInput:
|
789
829
|
pass
|
@@ -943,6 +983,11 @@ class AbortReq:
|
|
943
983
|
abort_all: bool = False
|
944
984
|
# The finished reason data
|
945
985
|
finished_reason: Optional[Dict[str, Any]] = None
|
986
|
+
# used in MultiTokenzierManager mode
|
987
|
+
rids: Optional[Union[List[str], str]] = None
|
988
|
+
|
989
|
+
def __post_init__(self):
|
990
|
+
self.rids = self.rid
|
946
991
|
|
947
992
|
|
948
993
|
@dataclass
|
@@ -1143,6 +1188,18 @@ class LoRAUpdateResult:
|
|
1143
1188
|
LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateResult
|
1144
1189
|
|
1145
1190
|
|
1191
|
+
@dataclass
|
1192
|
+
class MultiTokenizerRegisterReq:
|
1193
|
+
rids: Optional[Union[List[str], str]] = None
|
1194
|
+
ipc_name: Optional[str] = None
|
1195
|
+
|
1196
|
+
|
1197
|
+
@dataclass
|
1198
|
+
class MultiTokenizerWarpper:
|
1199
|
+
worker_id: int
|
1200
|
+
obj: Optional[Any] = None
|
1201
|
+
|
1202
|
+
|
1146
1203
|
class BlockReqType(Enum):
|
1147
1204
|
BLOCK = 1
|
1148
1205
|
UNBLOCK = 2
|
sglang/srt/managers/mm_utils.py
CHANGED
@@ -20,9 +20,11 @@ from sglang.srt.managers.schedule_batch import (
|
|
20
20
|
)
|
21
21
|
from sglang.srt.mem_cache.multimodal_cache import MultiModalCache
|
22
22
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
23
|
-
from sglang.srt.utils import flatten_nested_list, print_warning_once
|
23
|
+
from sglang.srt.utils import flatten_nested_list, is_npu, print_warning_once
|
24
24
|
from sglang.utils import logger
|
25
25
|
|
26
|
+
_is_npu = is_npu()
|
27
|
+
|
26
28
|
# NOTE: Using the shared logger from sglang.utils instead of creating a module-specific logger
|
27
29
|
# to ensure consistent logging behavior across the codebase. This prevents issues with log
|
28
30
|
# propagation that can cause some log messages (like 'server is fired up') to not appear
|
@@ -486,6 +488,8 @@ def get_embedding_and_mask(
|
|
486
488
|
if embedding is None:
|
487
489
|
return None, None
|
488
490
|
# 2. Get mask
|
491
|
+
if _is_npu:
|
492
|
+
torch.npu.current_stream().synchronize()
|
489
493
|
special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
|
490
494
|
# 3. Adjust embedding length if needed
|
491
495
|
embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
|