sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -4
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +3 -6
- sglang/compile_deep_gemm.py +136 -0
- sglang/lang/backend/anthropic.py +0 -4
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/openai.py +6 -2
- sglang/lang/backend/runtime_endpoint.py +5 -1
- sglang/lang/backend/vertexai.py +0 -1
- sglang/lang/compiler.py +1 -7
- sglang/lang/tracer.py +3 -7
- sglang/srt/_custom_ops.py +0 -2
- sglang/srt/configs/model_config.py +4 -1
- sglang/srt/constrained/outlines_jump_forward.py +14 -1
- sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
- sglang/srt/constrained/xgrammar_backend.py +27 -4
- sglang/srt/custom_op.py +0 -62
- sglang/srt/disaggregation/decode.py +105 -6
- sglang/srt/disaggregation/mini_lb.py +74 -9
- sglang/srt/disaggregation/mooncake/conn.py +33 -63
- sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
- sglang/srt/disaggregation/nixl/__init__.py +1 -0
- sglang/srt/disaggregation/nixl/conn.py +622 -0
- sglang/srt/disaggregation/prefill.py +137 -17
- sglang/srt/disaggregation/utils.py +32 -0
- sglang/srt/entrypoints/engine.py +4 -0
- sglang/srt/entrypoints/http_server.py +3 -7
- sglang/srt/entrypoints/verl_engine.py +7 -5
- sglang/srt/function_call_parser.py +60 -0
- sglang/srt/layers/activation.py +6 -8
- sglang/srt/layers/attention/flashattention_backend.py +883 -209
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/torch_native_backend.py +6 -1
- sglang/srt/layers/attention/triton_backend.py +6 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +18 -7
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
- sglang/srt/layers/dp_attention.py +1 -1
- sglang/srt/layers/layernorm.py +20 -5
- sglang/srt/layers/linear.py +17 -3
- sglang/srt/layers/moe/ep_moe/layer.py +17 -29
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/topk.py +27 -30
- sglang/srt/layers/parameter.py +0 -2
- sglang/srt/layers/quantization/__init__.py +1 -0
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +9 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
- sglang/srt/layers/quantization/deep_gemm.py +378 -0
- sglang/srt/layers/quantization/fp8.py +115 -132
- sglang/srt/layers/quantization/fp8_kernel.py +213 -88
- sglang/srt/layers/quantization/fp8_utils.py +189 -264
- sglang/srt/layers/quantization/gptq.py +13 -7
- sglang/srt/layers/quantization/modelopt_quant.py +2 -2
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/utils.py +5 -11
- sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -7
- sglang/srt/layers/radix_attention.py +15 -0
- sglang/srt/layers/rotary_embedding.py +9 -8
- sglang/srt/layers/sampler.py +7 -12
- sglang/srt/lora/backend/base_backend.py +18 -2
- sglang/srt/lora/backend/flashinfer_backend.py +1 -1
- sglang/srt/lora/backend/triton_backend.py +1 -1
- sglang/srt/lora/layers.py +1 -1
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +7 -1
- sglang/srt/managers/detokenizer_manager.py +0 -1
- sglang/srt/managers/io_struct.py +15 -3
- sglang/srt/managers/mm_utils.py +4 -3
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
- sglang/srt/managers/schedule_batch.py +15 -4
- sglang/srt/managers/scheduler.py +28 -77
- sglang/srt/managers/tokenizer_manager.py +116 -29
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +41 -29
- sglang/srt/mem_cache/memory_pool.py +38 -15
- sglang/srt/model_executor/cuda_graph_runner.py +15 -10
- sglang/srt/model_executor/model_runner.py +39 -31
- sglang/srt/models/bert.py +398 -0
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_nextn.py +74 -70
- sglang/srt/models/deepseek_v2.py +292 -348
- sglang/srt/models/llama.py +5 -5
- sglang/srt/models/minicpm3.py +31 -203
- sglang/srt/models/minicpmo.py +17 -6
- sglang/srt/models/qwen2.py +4 -1
- sglang/srt/models/qwen2_moe.py +14 -13
- sglang/srt/models/qwen3.py +335 -0
- sglang/srt/models/qwen3_moe.py +423 -0
- sglang/srt/openai_api/adapter.py +71 -4
- sglang/srt/openai_api/protocol.py +6 -1
- sglang/srt/reasoning_parser.py +0 -1
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/server_args.py +86 -72
- sglang/srt/speculative/build_eagle_tree.py +2 -2
- sglang/srt/speculative/eagle_utils.py +2 -2
- sglang/srt/speculative/eagle_worker.py +6 -14
- sglang/srt/utils.py +62 -6
- sglang/test/runners.py +5 -1
- sglang/test/test_block_fp8.py +167 -0
- sglang/test/test_custom_ops.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/version.py +1 -1
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +5 -5
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +116 -110
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +1 -1
- sglang/lang/__init__.py +0 -0
- sglang/srt/lora/backend/__init__.py +0 -25
- sglang/srt/server.py +0 -18
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/prefill.py
CHANGED
@@ -20,6 +20,7 @@ Life cycle of a request in the prefill server
 from __future__ import annotations

 import logging
+from collections import deque
 from typing import TYPE_CHECKING, List, Optional

 import torch
@@ -31,6 +32,8 @@ from sglang.srt.disaggregation.utils import (
     ReqToMetadataIdxAllocator,
     TransferBackend,
     get_kv_class,
+    kv_to_page_indices,
+    kv_to_page_num,
     poll_and_all_reduce,
 )
 from sglang.srt.managers.schedule_batch import FINISH_LENGTH, Req, ScheduleBatch
@@ -103,7 +106,7 @@ class PrefillBootstrapQueue:
         kv_args.aux_item_lens = [
             metadata_buffer[0].nbytes for metadata_buffer in self.metadata_buffers
         ]
-        kv_args.ib_device =
+        kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
         kv_args.gpu_id = self.scheduler.gpu_id
         kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
         kv_manager = kv_manager_class(
@@ -154,7 +157,8 @@ class PrefillBootstrapQueue:
                 self.req_to_metadata_buffer_idx_allocator.alloc()
             )
             assert req.metadata_buffer_index is not None
-
+            num_pages = kv_to_page_num(num_kv_indices, self.token_to_kv_pool.page_size)
+            req.disagg_kv_sender.init(num_pages, req.metadata_buffer_index)

             bootstrapped_reqs.append(req)
             indices_to_remove.add(i)
@@ -171,6 +175,70 @@ class SchedulerDisaggregationPrefillMixin:
     Mixin for Scheduler to handle disaggregation prefill
     """

+    @torch.no_grad()
+    def event_loop_normal_disagg_prefill(self):
+        """A normal scheduler loop for prefill worker in disaggregation mode."""
+
+        while True:
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+            self.waiting_queue.extend(
+                self.disagg_prefill_pending_queue.pop_bootstrapped()
+            )
+            self.process_prefill_chunk()
+            batch = self.get_new_batch_prefill()
+            self.cur_batch = batch
+
+            if batch:
+                result = self.run_batch(batch)
+                self.process_batch_result_disagg_prefill(batch, result)
+
+            if len(self.disagg_prefill_inflight_queue) > 0:
+                self.process_disagg_prefill_inflight_queue()
+
+            if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
+                self.check_memory()
+                self.new_token_ratio = self.init_new_token_ratio
+
+            self.last_batch = batch
+            # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
+            # Otherwise, it hangs under high concurrency
+            self.running_batch.batch_is_full = False
+
+    @torch.no_grad()
+    def event_loop_overlap_disagg_prefill(self):
+        self.result_queue = deque()
+
+        while True:
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+            self.waiting_queue.extend(
+                self.disagg_prefill_pending_queue.pop_bootstrapped()
+            )
+            self.process_prefill_chunk()
+            batch = self.get_new_batch_prefill()
+            self.cur_batch = batch
+
+            if batch:
+                result = self.run_batch(batch)
+                self.result_queue.append((batch.copy(), result))
+
+            if self.last_batch:
+                tmp_batch, tmp_result = self.result_queue.popleft()
+                self.process_batch_result_disagg_prefill(tmp_batch, tmp_result)
+
+            if len(self.disagg_prefill_inflight_queue) > 0:
+                self.process_disagg_prefill_inflight_queue()
+
+            if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
+                self.check_memory()
+                self.new_token_ratio = self.init_new_token_ratio
+
+            self.last_batch = batch
+            # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
+            # Otherwise, it hangs under high concurrency
+            self.running_batch.batch_is_full = False
+
     def process_batch_result_disagg_prefill(
         self: Scheduler, batch: ScheduleBatch, result: GenerationBatchResult
     ) -> None:
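Note: the overlap loop above defers result handling by one iteration; each launched batch's result is queued, and the previous iteration's result is consumed while the current batch runs on the GPU. A minimal standalone sketch of that deferred-consumption pattern, using placeholder batch names rather than sglang objects:

    from collections import deque

    result_queue = deque()
    last_batch = None
    for batch in ["b0", "b1", "b2", None]:  # None stands for an idle iteration
        if batch is not None:
            result_queue.append((batch, f"result({batch})"))  # launch without waiting
        if last_batch is not None:
            done_batch, done_result = result_queue.popleft()  # consume one step late
            print("processing", done_batch, done_result)
        last_batch = batch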
@@ -179,7 +247,26 @@ class SchedulerDisaggregationPrefillMixin:
         Adapted from process_batch_result_prefill
         """

-
+        (
+            logits_output,
+            next_token_ids,
+            extend_input_len_per_req,
+            extend_logprob_start_len_per_req,
+            bid,
+        ) = (
+            result.logits_output,
+            result.next_token_ids,
+            result.extend_input_len_per_req,
+            result.extend_logprob_start_len_per_req,
+            result.bid,
+        )
+
+        # Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
+        if self.enable_overlap:
+            # wait
+            _, next_token_ids = self.tp_worker.resolve_batch_result(bid)
+        else:
+            next_token_ids = result.next_token_ids.tolist()

         for req, next_token_id in zip(batch.reqs, next_token_ids, strict=True):
             req: Req
@@ -193,12 +280,8 @@ class SchedulerDisaggregationPrefillMixin:
                # being chunked reqs' prefill is not finished
                req.is_chunked -= 1

-
-
-            batch.next_batch_sampling_info.update_regex_vocab_mask()
-            # We need to remove this for overlap schedule.
-            self.current_stream.synchronize()
-            batch.next_batch_sampling_info.sampling_info_done.set()
+                if self.enable_overlap:
+                    self.send_kv_chunk(req, end_idx=req.tmp_end_idx)

     def process_disagg_prefill_inflight_queue(self: Scheduler) -> None:
         """
@@ -210,7 +293,7 @@ class SchedulerDisaggregationPrefillMixin:

         polls = poll_and_all_reduce(
             [req.disagg_kv_sender for req in self.disagg_prefill_inflight_queue],
-            self.
+            self.attn_tp_cpu_group,
         )

         undone_reqs: List[Req] = []
@@ -243,31 +326,68 @@ class SchedulerDisaggregationPrefillMixin:
             # only finished requests to running_batch.
             self.last_batch.filter_batch(chunked_req_to_exclude=self.chunked_req)
             self.tree_cache.cache_unfinished_req(self.chunked_req)
-
+            if (
+                self.enable_overlap
+            ):  # Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved
+                self.chunked_req.tmp_end_idx = min(
+                    len(self.chunked_req.fill_ids),
+                    len(self.chunked_req.origin_input_ids),
+                )
+            else:
+                self.send_kv_chunk(self.chunked_req)
             # chunked request keeps its rid but will get a new req_pool_idx
             self.req_to_token_pool.free(self.chunked_req.req_pool_idx)
             self.running_batch.batch_is_full = False

     def send_kv_chunk(
-        self: Scheduler,
+        self: Scheduler,
+        req: Req,
+        token_id: Optional[int] = None,
+        end_idx: Optional[int] = None,
     ) -> None:
         """
         Send a prefilled chunk to the decode server
         """
+        page_size = self.token_to_kv_pool_allocator.page_size
         start_idx = req.start_send_idx
-        end_idx
+        # if end_idx is specified, use it as the end index of the kv chunk because in overlap schedule,
+        # the resolved length is not the same as fill_ids's length
+        end_idx = (
+            end_idx
+            if end_idx is not None
+            else min(len(req.fill_ids), len(req.origin_input_ids))
+        )
+        last_chunk = token_id is not None
+
+        if (not last_chunk) and (
+            end_idx % page_size != 0
+        ):  # todo: remove the second condition
+            # if not the last chunk and the last page is partial, delay the last partial page to the next send
+            end_idx = end_idx - end_idx % page_size

         # Update next start_send_idx
         req.start_send_idx = end_idx

         kv_indices = (
-            self.req_to_token_pool.req_to_token[req.req_pool_idx
+            self.req_to_token_pool.req_to_token[req.req_pool_idx, start_idx:end_idx]
             .cpu()
             .numpy()
         )
-        if
+        if last_chunk is True:
             self.disagg_prefill_pending_queue.store_prefill_results(
                 req.metadata_buffer_index, token_id
             )
-
-
+        page_indices = kv_to_page_indices(kv_indices, page_size)
+
+        page_start_idx = start_idx // page_size
+        page_end_idx = page_start_idx + len(page_indices)
+
+        if len(page_indices) == 0:
+            logger.info(
+                f"Skip sending kv chunk for request {req.rid=} {req.bootstrap_room=} because page_indices is empty"
+            )
+            return
+
+        req.disagg_kv_sender.send(
+            page_indices, slice(page_start_idx, page_end_idx), last_chunk
+        )
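Note: the page-alignment rule in send_kv_chunk (hold back a trailing partial page unless this is the final chunk carrying a token_id) reduces to simple integer arithmetic. A minimal sketch, assuming a page_size of 64 and a 100-token chunk:

    page_size = 64
    start_idx, end_idx = 0, 100   # 100 prefilled tokens so far, not the last chunk
    last_chunk = False

    if not last_chunk and end_idx % page_size != 0:
        end_idx -= end_idx % page_size  # 100 -> 64; the partial page 64..99 is sent later

    num_pages = (end_idx - start_idx + page_size - 1) // page_size
    print(end_idx, num_pages)  # 64 1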
sglang/srt/disaggregation/utils.py
CHANGED
@@ -4,6 +4,7 @@ from collections import deque
 from enum import Enum
 from typing import List

+import numpy as np
 import torch
 import torch.distributed as dist

@@ -46,6 +47,7 @@ class ReqToMetadataIdxAllocator:

 class TransferBackend(Enum):
     MOONCAKE = "mooncake"
+    NIXL = "nixl"
     FAKE = "fake"

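Note: the new enum member is consumed by get_kv_class, which the next hunk extends. A hedged usage sketch, assuming sglang with its optional NIXL dependency is installed:

    from sglang.srt.disaggregation.utils import KVClassType, TransferBackend, get_kv_class

    sender_cls = get_kv_class(TransferBackend.NIXL, KVClassType.SENDER)  # resolves to NixlKVSender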
@@ -72,4 +74,34 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
             KVClassType.BOOTSTRAP_SERVER: MooncakeKVBootstrapServer,
         }
         return class_mapping.get(class_type)
+    if transfer_backend == TransferBackend.NIXL:
+        from sglang.srt.disaggregation.nixl import (
+            NixlKVBootstrapServer,
+            NixlKVManager,
+            NixlKVReceiver,
+            NixlKVSender,
+        )
+
+        class_mapping = {
+            KVClassType.MANAGER: NixlKVManager,
+            KVClassType.SENDER: NixlKVSender,
+            KVClassType.RECEIVER: NixlKVReceiver,
+            KVClassType.BOOTSTRAP_SERVER: NixlKVBootstrapServer,
+        }
+        return class_mapping.get(class_type)
     raise ValueError(f"Unsupported transfer backend: {transfer_backend}")
+
+
+def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
+    # 1. The page is guaruanteed to be full except the last page.
+    # 2. page index = kv_index // page_size
+    # The return vector is kv_indices[::page_size] // page_size
+    if page_size == 1:  # shortcut
+        return kv_indices
+
+    return kv_indices[::page_size] // page_size
+
+
+def kv_to_page_num(num_kv_indices: int, page_size: int):
+    # ceil(num_kv_indices / page_size)
+    return (num_kv_indices + page_size - 1) // page_size
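Note: a worked example of the two page helpers above, assuming contiguous KV slots and a page_size of 4:

    import numpy as np

    kv_indices = np.arange(32, 42)  # 10 contiguous KV slots: 32..41
    page_size = 4

    # kv_to_page_indices: first slot of every page, divided by page_size
    print(kv_indices[::page_size] // page_size)            # [ 8  9 10]
    # kv_to_page_num: ceil(10 / 4) pages
    print((len(kv_indices) + page_size - 1) // page_size)  # 3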
sglang/srt/entrypoints/engine.py
CHANGED
@@ -279,6 +279,10 @@ class Engine(EngineBase):
         self.shutdown()
         return False

+    def flush_cache(self):
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(self.tokenizer_manager.flush_cache())
+
     def start_profile(self):
         loop = asyncio.get_event_loop()
         loop.run_until_complete(self.tokenizer_manager.start_profile())
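Note: a hypothetical use of the new synchronous Engine.flush_cache entry point; the model path is a placeholder:

    import sglang as sgl

    engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
    engine.generate("Hello", {"max_new_tokens": 8})
    engine.flush_cache()  # blocks until the tokenizer manager has flushed the radix cache
    engine.shutdown()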
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -25,11 +25,8 @@ import multiprocessing as multiprocessing
 import os
 import threading
 import time
-from ast import Mult
 from http import HTTPStatus
-from typing import AsyncIterator, Callable, Dict, Optional
-
-from sglang.srt.model_executor.model_runner import LocalSerializedTensor
+from typing import AsyncIterator, Callable, Dict, Optional

 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -84,7 +81,6 @@ from sglang.srt.openai_api.protocol import ModelCard, ModelList
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
-    MultiprocessingSerializer,
     add_api_key_middleware,
     add_prometheus_middleware,
     delete_directory,
@@ -315,11 +311,11 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
 @app.api_route("/flush_cache", methods=["GET", "POST"])
 async def flush_cache():
     """Flush the radix cache."""
-    _global_state.tokenizer_manager.flush_cache()
+    ret = await _global_state.tokenizer_manager.flush_cache()
     return Response(
         content="Cache flushed.\nPlease check backend logs for more details. "
         "(When there are running or waiting requests, the operation will not be performed.)\n",
-        status_code=200,
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
     )

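Note: the /flush_cache endpoint now propagates failure; a quick check against a locally running server (the address and port are assumptions):

    import requests

    resp = requests.post("http://localhost:30000/flush_cache")
    # 200 when flushed; 400 when requests are still running or waiting
    print(resp.status_code, resp.text)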
sglang/srt/entrypoints/verl_engine.py
CHANGED
@@ -12,18 +12,17 @@
 # limitations under the License.
 # ==============================================================================
 import os
-from typing import Dict, List, Literal, Optional, Tuple, Union
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union

 import torch
 import torch.distributed as dist
 from PIL.Image import Image
 from torch.distributed.tensor import DeviceMesh, DTensor

+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.entrypoints.http_server_engine import HttpServerEngineAdapter
 from sglang.srt.model_executor.model_runner import LocalSerializedTensor
 from sglang.srt.patch_torch import monkey_patch_torch_reductions
-from sglang.srt.server import Engine
-from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj


@@ -125,7 +124,7 @@ class VerlEngine:

     def update_weights_from_tensor(
         self,
-        named_tensors:
+        named_tensors: Iterable[Tuple[str, torch.Tensor]],
         load_format: Optional[str] = None,
     ):
         # Most naive implementation, can optimize a lot if it is bottleneck
@@ -154,9 +153,12 @@ class VerlEngine:
                 )
             ],
            load_format=load_format,
-            flush_cache=
+            flush_cache=False,
        )

+        if self._tp_rank == 0:
+            self._engine.tokenizer_manager.flush_cache()
+
     def release_memory_occupation(self):
         if self._tp_rank == 0:
             self._engine.release_memory_occupation()
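Note: the updated signature accepts any Iterable[Tuple[str, torch.Tensor]]. A sketch of the expected shape, with placeholder tensor names; the engine call is commented out because construction is omitted:

    from typing import Iterable, Tuple

    import torch

    named_tensors: Iterable[Tuple[str, torch.Tensor]] = [
        ("embed_tokens.weight", torch.zeros(8, 4)),  # placeholder names and shapes
        ("lm_head.weight", torch.zeros(8, 4)),
    ]
    # verl_engine.update_weights_from_tensor(named_tensors)  # cache now flushed on tp rank 0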
sglang/srt/function_call_parser.py
CHANGED
@@ -25,6 +25,7 @@ TOOLS_TAG_LIST = [
     "<tool_call>",
     "<|python_tag|>",
     "[TOOL_CALLS]",
+    "<|tool▁calls▁begin|>",
 ]


@@ -477,6 +478,64 @@ class Llama32Detector(BaseFormatDetector):
         )


+class DeepSeekV3Detector(BaseFormatDetector):
+    """
+    Detector for DeepSeek models.
+    Assumes function call format:
+    '<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Tokyo"}\n```<|tool▁call▁end|>\n<|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Paris"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<|tool▁calls▁begin|>"
+        self.eot_token = "<|tool▁calls▁end|>"
+        self.func_call_regex = r"<|tool▁call▁begin|>.*?<|tool▁call▁end|>"
+        self.func_detail_regex = r"<|tool▁call▁begin|>(.*)<|tool▁sep|>(.*)\n```json\n(.*)\n```<|tool▁call▁end|>"
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a deepseek format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
+                func_name = func_detail.group(2)
+                func_args = func_detail.group(3)
+                func_args = json.loads(func_args)
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": func_args}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin="<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>"
+            + name
+            + "\n```json\n",
+            end="\n```<|tool▁call▁end|><|tool▁calls▁end|>",
+            trigger="<|tool▁calls▁begin|>",
+        )
+
+
 class MultiFormatParser:
     def __init__(self, detectors: List[BaseFormatDetector]):
         """
@@ -543,6 +602,7 @@ class FunctionCallParser:
         "llama3": Llama32Detector,
         "qwen25": Qwen25Detector,
         "mistral": MistralDetector,
+        "deepseekv3": DeepSeekV3Detector,
     }

     def __init__(self, tools: List[Tool], tool_call_parser: str):
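Note: for illustration, a standalone parse of the tool-call format that DeepSeekV3Detector targets; this uses an escaped regex written for this example, not the detector's own attributes:

    import json
    import re

    text = (
        "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n"
        '```json\n{"location": "Tokyo"}\n```<|tool▁call▁end|><|tool▁calls▁end|>'
    )
    pattern = re.compile(
        r"<\|tool▁call▁begin\|>(.*?)<\|tool▁sep\|>(.*?)\n```json\n(.*?)\n```<\|tool▁call▁end\|>",
        re.DOTALL,
    )
    for call_type, name, args in pattern.findall(text):
        print(call_type, name, json.loads(args))
        # -> function get_current_weather {'location': 'Tokyo'}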
sglang/srt/layers/activation.py
CHANGED
@@ -21,13 +21,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-from sglang.srt.utils import is_cuda_available
-
-_is_cuda = is_cuda_available()
-
-if _is_cuda:
-    from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
-
 from sglang.srt.custom_op import CustomOp
 from sglang.srt.distributed import (
     divide,
@@ -35,7 +28,12 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
-from sglang.srt.utils import set_weight_attrs
+from sglang.srt.utils import is_cuda, set_weight_attrs
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul

 logger = logging.getLogger(__name__)

|