PyPI - sglang - Versions diffs - 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl - Mend

sglang 0.5.1.post2py3-none-any.whl → 0.5.2rc0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

sglang/bench_one_batch.py +3 -0
sglang/bench_one_batch_server.py +79 -53
sglang/bench_serving.py +186 -14
sglang/profiler.py +0 -1
sglang/srt/configs/__init__.py +2 -0
sglang/srt/configs/longcat_flash.py +104 -0
sglang/srt/configs/model_config.py +12 -0
sglang/srt/connector/__init__.py +1 -1
sglang/srt/connector/base_connector.py +1 -2
sglang/srt/connector/redis.py +2 -2
sglang/srt/connector/serde/__init__.py +1 -1
sglang/srt/connector/serde/safe_serde.py +4 -3
sglang/srt/conversation.py +38 -5
sglang/srt/disaggregation/ascend/conn.py +75 -0
sglang/srt/disaggregation/launch_lb.py +0 -13
sglang/srt/disaggregation/mini_lb.py +33 -8
sglang/srt/disaggregation/prefill.py +1 -1
sglang/srt/distributed/parallel_state.py +24 -14
sglang/srt/entrypoints/engine.py +19 -12
sglang/srt/entrypoints/http_server.py +174 -34
sglang/srt/entrypoints/openai/protocol.py +87 -24
sglang/srt/entrypoints/openai/serving_chat.py +50 -9
sglang/srt/entrypoints/openai/serving_completions.py +15 -0
sglang/srt/eplb/eplb_manager.py +26 -2
sglang/srt/eplb/expert_distribution.py +29 -2
sglang/srt/function_call/deepseekv31_detector.py +222 -0
sglang/srt/function_call/function_call_parser.py +2 -0
sglang/srt/function_call/gpt_oss_detector.py +144 -256
sglang/srt/harmony_parser.py +588 -0
sglang/srt/hf_transformers_utils.py +26 -7
sglang/srt/layers/activation.py +12 -0
sglang/srt/layers/attention/ascend_backend.py +374 -136
sglang/srt/layers/attention/flashattention_backend.py +241 -7
sglang/srt/layers/attention/flashinfer_backend.py +5 -2
sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
sglang/srt/layers/communicator.py +1 -2
sglang/srt/layers/layernorm.py +28 -3
sglang/srt/layers/linear.py +3 -2
sglang/srt/layers/logits_processor.py +1 -1
sglang/srt/layers/moe/cutlass_moe.py +0 -8
sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
sglang/srt/layers/moe/ep_moe/layer.py +13 -13
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
sglang/srt/layers/moe/topk.py +35 -12
sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
sglang/srt/layers/quantization/fp8.py +2 -1
sglang/srt/layers/quantization/fp8_kernel.py +2 -2
sglang/srt/layers/quantization/fp8_utils.py +2 -2
sglang/srt/layers/quantization/modelopt_quant.py +7 -0
sglang/srt/layers/quantization/mxfp4.py +25 -27
sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
sglang/srt/layers/quantization/utils.py +13 -0
sglang/srt/layers/quantization/w8a8_int8.py +7 -3
sglang/srt/layers/rotary_embedding.py +28 -1
sglang/srt/layers/sampler.py +29 -5
sglang/srt/layers/utils.py +0 -14
sglang/srt/managers/cache_controller.py +237 -204
sglang/srt/managers/detokenizer_manager.py +48 -2
sglang/srt/managers/io_struct.py +57 -0
sglang/srt/managers/mm_utils.py +5 -1
sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
sglang/srt/managers/scheduler.py +94 -9
sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
sglang/srt/managers/tokenizer_manager.py +122 -42
sglang/srt/mem_cache/chunk_cache.py +1 -1
sglang/srt/mem_cache/hicache_storage.py +51 -23
sglang/srt/mem_cache/hiradix_cache.py +87 -71
sglang/srt/mem_cache/lora_radix_cache.py +1 -1
sglang/srt/mem_cache/memory_pool.py +77 -14
sglang/srt/mem_cache/memory_pool_host.py +4 -5
sglang/srt/mem_cache/radix_cache.py +6 -4
sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
sglang/srt/mem_cache/swa_radix_cache.py +1 -1
sglang/srt/model_executor/model_runner.py +6 -5
sglang/srt/model_loader/loader.py +15 -24
sglang/srt/model_loader/utils.py +12 -0
sglang/srt/models/deepseek_v2.py +38 -13
sglang/srt/models/gpt_oss.py +2 -15
sglang/srt/models/llama_eagle3.py +4 -0
sglang/srt/models/longcat_flash.py +1015 -0
sglang/srt/models/longcat_flash_nextn.py +691 -0
sglang/srt/models/qwen2.py +26 -3
sglang/srt/models/qwen2_5_vl.py +66 -41
sglang/srt/models/qwen2_moe.py +22 -2
sglang/srt/models/transformers.py +1 -1
sglang/srt/multimodal/processors/base_processor.py +4 -2
sglang/srt/reasoning_parser.py +56 -300
sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
sglang/srt/server_args.py +122 -56
sglang/srt/speculative/eagle_worker.py +28 -8
sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
sglang/srt/utils.py +73 -5
sglang/test/attention/test_trtllm_mla_backend.py +12 -3
sglang/version.py +1 -1
{sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
{sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
{sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
{sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0

sglang/srt/managers/detokenizer_manager.py CHANGED Viewed

@@ -32,11 +32,14 @@ from sglang.srt.managers.io_struct import (
     BatchStrOut,
     BatchTokenIDOut,
     FreezeGCReq,
+    MultiTokenizerRegisterReq,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerMixin
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     configure_logger,
     freeze_gc,
+    get_worker_ids_from_req_rids,
     get_zmq_socket,
     kill_itself_when_parent_died,
 )
@@ -67,7 +70,7 @@ class DecodeStatus:
     sent_offset: int = 0
-class DetokenizerManager:
+class DetokenizerManager(MultiTokenizerMixin):
     """DetokenizerManager is a process that detokenizes the token ids."""
     def __init__(
@@ -102,10 +105,13 @@ class DetokenizerManager:
                 (BatchEmbeddingOut, self.handle_batch_embedding_out),
                 (BatchTokenIDOut, self.handle_batch_token_id_out),
                 (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
+                (MultiTokenizerRegisterReq, lambda x: x),
                 (FreezeGCReq, self.handle_freeze_gc_req),
             ]
         )
+        self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
     def event_loop(self):
         """The event loop that handles requests"""
         while True:
@@ -114,6 +120,39 @@ class DetokenizerManager:
             if output is not None:
                 self.send_to_tokenizer.send_pyobj(output)
+    def multi_tokenizer_manager_event_loop(self):
+        """The event loop that handles requests, for multi tokenizer manager mode only"""
+        self.create_sockets_mapping()
+        while True:
+            recv_obj = self.recv_from_scheduler.recv_pyobj()
+            output = self._request_dispatcher(recv_obj)
+            if output is None:
+                continue
+            # Extract worker_id from rid
+            if isinstance(recv_obj.rids, list):
+                worker_ids = get_worker_ids_from_req_rids(recv_obj.rids)
+            else:
+                raise RuntimeError(
+                    f"for tokenizer_worker_num > 1, recv_obj.rids must be a list"
+                )
+            # Send data using the corresponding socket
+            for i, worker_id in enumerate(worker_ids):
+                if isinstance(recv_obj, MultiTokenizerRegisterReq):
+                    if self.register_tokenizer_ipc(recv_obj, worker_id):
+                        logger.info(
+                            f"DetokenizerManager Created ZMQ socket for worker {worker_id}"
+                        )
+                    continue
+                else:
+                    if worker_id not in self.tokenizer_mapping:
+                        logger.error(
+                            f"Tokenizer Worker ID {worker_id} not registered. Check if the server Process {worker_id} is alive"
+                        )
+                        continue
+                    new_output = self._handle_output_by_index(output, i)
+                    self.tokenizer_mapping[worker_id].send_pyobj(new_output)
     def trim_matched_stop(
         self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
     ):
@@ -133,6 +172,9 @@ class DetokenizerManager:
         # Trim stop token.
         if isinstance(matched, int) and isinstance(output, list):
+            # 200012 <|call|> is the tool call token and one of eos tokens for gpt-oss model
+            if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
+                return output
             assert len(output) > 0
             return output[:-1]
         return output
@@ -280,8 +322,12 @@ def run_detokenizer_process(
     try:
         manager = DetokenizerManager(server_args, port_args)
-        manager.event_loop()
+        if server_args.tokenizer_worker_num > 1:
+            manager.multi_tokenizer_manager_event_loop()
+        else:
+            manager.event_loop()
     except Exception:
+        manager.clear_tokenizer_mapping()
         traceback = get_exception_traceback()
         logger.error(f"DetokenizerManager hit an exception: {traceback}")
         parent_process.send_signal(signal.SIGQUIT)

sglang/srt/managers/io_struct.py CHANGED Viewed

@@ -533,6 +533,21 @@ class TokenizedGenerateReqInput:
     dp_balance_id: int = -1
+@dataclass
+class BatchTokenizedGenerateReqInput:
+    # The batch of tokenized requests
+    batch: List[TokenizedGenerateReqInput]
+    def __len__(self):
+        return len(self.batch)
+    def __getitem__(self, i):
+        return self.batch[i]
+    def __iter__(self):
+        return iter(self.batch)
 @dataclass
 class EmbeddingReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
@@ -668,6 +683,21 @@ class TokenizedEmbeddingReqInput:
     dp_balance_id: int = -1
+@dataclass
+class BatchTokenizedEmbeddingReqInput:
+    # The batch of tokenized embedding requests
+    batch: List[TokenizedEmbeddingReqInput]
+    def __len__(self):
+        return len(self.batch)
+    def __getitem__(self, i):
+        return self.batch[i]
+    def __iter__(self):
+        return iter(self.batch)
 @dataclass
 class BatchTokenIDOut:
     # The request id
@@ -784,6 +814,16 @@ class BatchEmbeddingOut:
     cached_tokens: List[int]
+@dataclass
+class ClearHiCacheReqInput:
+    pass
+@dataclass
+class ClearHiCacheReqOutput:
+    success: bool
 @dataclass
 class FlushCacheReqInput:
     pass
@@ -943,6 +983,11 @@ class AbortReq:
     abort_all: bool = False
     # The finished reason data
     finished_reason: Optional[Dict[str, Any]] = None
+    # used in MultiTokenzierManager mode
+    rids: Optional[Union[List[str], str]] = None
+    def __post_init__(self):
+        self.rids = self.rid
 @dataclass
@@ -1143,6 +1188,18 @@ class LoRAUpdateResult:
 LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateResult
+@dataclass
+class MultiTokenizerRegisterReq:
+    rids: Optional[Union[List[str], str]] = None
+    ipc_name: Optional[str] = None
+@dataclass
+class MultiTokenizerWarpper:
+    worker_id: int
+    obj: Optional[Any] = None
 class BlockReqType(Enum):
     BLOCK = 1
     UNBLOCK = 2

sglang/srt/managers/mm_utils.py CHANGED Viewed

@@ -20,9 +20,11 @@ from sglang.srt.managers.schedule_batch import (
 )
 from sglang.srt.mem_cache.multimodal_cache import MultiModalCache
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import flatten_nested_list, print_warning_once
+from sglang.srt.utils import flatten_nested_list, is_npu, print_warning_once
 from sglang.utils import logger
+_is_npu = is_npu()
 # NOTE: Using the shared logger from sglang.utils instead of creating a module-specific logger
 # to ensure consistent logging behavior across the codebase. This prevents issues with log
 # propagation that can cause some log messages (like 'server is fired up') to not appear
@@ -486,6 +488,8 @@ def get_embedding_and_mask(
         if embedding is None:
             return None, None
     # 2. Get mask
+    if _is_npu:
+        torch.npu.current_stream().synchronize()
     special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
     # 3. Adjust embedding length if needed
     embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)

sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl

sglang 0.5.1.post2py3-none-any.whl → 0.5.2rc0py3-none-any.whl