sglang 0.4.2.post2__tar.gz → 0.4.2.post4__tar.gz
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registry.
- {sglang-0.4.2.post2/sglang.egg-info → sglang-0.4.2.post4}/PKG-INFO +4 -4
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/pyproject.toml +3 -3
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/check_env.py +1 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/constrained/outlines_backend.py +4 -1
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/function_call_parser.py +96 -69
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/double_sparsity_backend.py +1 -3
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/flashinfer_backend.py +34 -41
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/triton_backend.py +64 -16
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +337 -3
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/triton_ops/extend_attention.py +70 -42
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -5
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.2.post4/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/fp8_kernel.py +43 -10
- sglang-0.4.2.post4/sglang/srt/lora/backend/__init__.py +28 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/lora/backend/base_backend.py +31 -9
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/lora/backend/flashinfer_backend.py +41 -4
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/lora/backend/triton_backend.py +34 -4
- sglang-0.4.2.post4/sglang/srt/lora/layers.py +293 -0
- sglang-0.4.2.post4/sglang/srt/lora/lora.py +185 -0
- sglang-0.4.2.post4/sglang/srt/lora/lora_manager.py +193 -0
- sglang-0.4.2.post4/sglang/srt/lora/mem_pool.py +174 -0
- sglang-0.4.2.post4/sglang/srt/lora/triton_ops/__init__.py +11 -0
- sglang-0.4.2.post4/sglang/srt/lora/triton_ops/gate_up_lora_b.py +170 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/lora/triton_ops/qkv_lora_b.py +5 -5
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +2 -2
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +2 -2
- sglang-0.4.2.post4/sglang/srt/lora/utils.py +141 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/model_executor/cuda_graph_runner.py +4 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/llama.py +8 -3
- sglang-0.4.2.post4/sglang/srt/speculative/build_eagle_tree.py +729 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/speculative/eagle_utils.py +134 -61
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/speculative/eagle_worker.py +1 -0
- sglang-0.4.2.post4/sglang/version.py +1 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4/sglang.egg-info}/PKG-INFO +4 -4
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang.egg-info/SOURCES.txt +17 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang.egg-info/requires.txt +3 -3
- sglang-0.4.2.post2/sglang/srt/lora/backend/__init__.py +0 -8
- sglang-0.4.2.post2/sglang/srt/lora/lora.py +0 -410
- sglang-0.4.2.post2/sglang/srt/lora/lora_manager.py +0 -361
- sglang-0.4.2.post2/sglang/srt/lora/triton_ops/__init__.py +0 -5
- sglang-0.4.2.post2/sglang/srt/speculative/build_eagle_tree.py +0 -349
- sglang-0.4.2.post2/sglang/version.py +0 -1
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/LICENSE +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/README.md +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/setup.cfg +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/api.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/bench_latency.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/bench_one_batch.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/bench_serving.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/global_config.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/choices.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/ir.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/launch_server.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/conversation.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/custom_op.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/parallel_state.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/entrypoints/engine.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/entrypoints/http_server.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/vision.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/dp_attention.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/moe/topk.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/parameter.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/cache_controller.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/schedule_batch.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/scheduler.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/model_executor/model_runner.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/model_loader/loader.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/grok.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/minicpmv.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/server.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/server_args.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/runners.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/test/test_utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/utils.py +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.2.post2/sglang.egg-info → sglang-0.4.2.post4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.2.post2
+Version: 0.4.2.post4
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004

@@ -239,11 +239,11 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.3.
+Requires-Dist: sgl-kernel>=0.0.3.post3; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm
+Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
 Requires-Dist: flashinfer_python>=0.2.0.post2; extra == "srt"
-Requires-Dist: outlines
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
{sglang-0.4.2.post2 → sglang-0.4.2.post4}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.2.post2"
+version = "0.4.2.post4"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"

@@ -25,8 +25,8 @@ runtime_common = [
 ]
 srt = [
     "sglang[runtime_common]", "cuda-python",
-    "sgl-kernel>=0.0.3.
-    "flashinfer_python>=0.2.0.post2", "outlines>=0.0.44
+    "sgl-kernel>=0.0.3.post3", "torch", "vllm>=0.6.4.post1,<=0.7.2",
+    "flashinfer_python>=0.2.0.post2", "outlines>=0.0.44,<=0.1.11"
 ]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
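Taken together, the dependency hunks above tighten the srt extra for 0.4.2.post4: sgl-kernel must be at least 0.0.3.post3, vllm is capped at 0.7.2, and outlines is capped at 0.1.11. A quick sanity check of an installed environment against these pins might look like the following sketch, using the standard importlib.metadata and packaging APIs (the pin set is copied from the hunk above):

# Sketch: check an environment against the 0.4.2.post4 "srt" pins.
from importlib.metadata import PackageNotFoundError, version
from packaging.specifiers import SpecifierSet

PINS = {
    "sgl-kernel": SpecifierSet(">=0.0.3.post3"),
    "vllm": SpecifierSet(">=0.6.4.post1,<=0.7.2"),
    "outlines": SpecifierSet(">=0.0.44,<=0.1.11"),
    "flashinfer-python": SpecifierSet(">=0.2.0.post2"),
}

for pkg, spec in PINS.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
        continue
    # SpecifierSet supports `in` checks against version strings.
    status = "ok" if installed in spec else f"violates {spec}"
    print(f"{pkg} {installed}: {status}")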
{sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/constrained/outlines_backend.py

@@ -35,7 +35,10 @@ is_hip_ = is_hip()
 if is_hip_:
     from outlines_core.fsm.json_schema import build_regex_from_schema
 else:
-    from outlines.fsm.json_schema import build_regex_from_schema
+    try:
+        from outlines.fsm.json_schema import build_regex_from_schema
+    except ImportError:
+        from outlines_core.fsm.json_schema import build_regex_from_schema
 
 
 logger = logging.getLogger(__name__)
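The new try/except exists because outlines moved build_regex_from_schema out of outlines.fsm.json_schema and into the separate outlines_core package in its 0.1.x line, so the old import path disappears on newer releases; the fallback keeps the whole pinned outlines range working. A minimal sketch of the helper in use (an assumption-laden example, not sglang code; it takes the JSON schema serialized as a string and returns a regular expression):

# Sketch: the fallback import, then converting a JSON schema to a regex.
import json

try:
    from outlines.fsm.json_schema import build_regex_from_schema  # outlines < 0.1.x
except ImportError:
    from outlines_core.fsm.json_schema import build_regex_from_schema  # outlines >= 0.1.x

schema = {"type": "object", "properties": {"name": {"type": "string"}}}
regex = build_regex_from_schema(json.dumps(schema))

# Any JSON string conforming to the schema should match this regex,
# which is what the grammar backend uses for constrained decoding.
print(regex)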
{sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/function_call_parser.py

@@ -1,4 +1,5 @@
 import json
+import logging
 import re
 from abc import ABC, abstractmethod
 from json import JSONDecodeError, JSONDecoder

@@ -8,6 +9,8 @@ import partial_json_parser
 from partial_json_parser.core.options import Allow
 from pydantic import BaseModel, Field
 
+logger = logging.getLogger(__name__)
+
 TOOLS_TAG_LIST = [
     "<|plugin|>",
     "<function=",
@@ -88,17 +91,43 @@ class BaseFormatDetector:
         self.bot_token = ""
         self.eot_token = ""
 
-    def parse_base_json(self, action:
-        )
+    def parse_base_json(self, action: Any, tools: List[Function]) -> List[ToolCallItem]:
+        tool_indices = {
+            tool.function.name: i for i, tool in enumerate(tools) if tool.function.name
+        }
+        if not isinstance(action, list):
+            name = action.get("name")
+            if not name or name not in tool_indices:
+                logger.warning(f"Model attempted to call undefined function: {name}")
+                return []
+
+            return [
+                ToolCallItem(
+                    tool_index=tool_indices[name],
+                    name=name,
+                    parameters=json.dumps(
+                        action.get("parameters") or action.get("arguments", {}),
+                        ensure_ascii=False,
+                    ),
+                )
+            ]
+
+        results = []
+        for act in action:
+            name = act.get("name")
+            if name and name in tool_indices:
+                results.append(
+                    ToolCallItem(
+                        tool_index=tool_indices[name],
+                        name=name,
+                        parameters=json.dumps(
+                            act.get("parameters") or act.get("arguments", {}),
+                            ensure_ascii=False,
+                        ),
+                    )
+                )
+
+        return results
 
     def detect_and_parse(self, text: str, tools: List[Function]) -> List[ToolCallItem]:
         """
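The rewritten parse_base_json resolves each parsed call against the request's tool list: tool_index is now the tool's position in that list, and a call naming a function that was never declared is dropped with a warning rather than passed through. A toy reproduction of the lookup logic (the SimpleNamespace stand-ins are hypothetical; in sglang the entries are pydantic Tool/Function models):

# Sketch of the new name-validation behavior in parse_base_json.
from types import SimpleNamespace

tools = [
    SimpleNamespace(function=SimpleNamespace(name="get_weather")),
    SimpleNamespace(function=SimpleNamespace(name="search")),
]
tool_indices = {t.function.name: i for i, t in enumerate(tools)}

for action in (
    {"name": "search", "arguments": {"q": "sglang"}},
    {"name": "get_stock_price", "arguments": {"ticker": "XYZ"}},  # undeclared
):
    name = action.get("name")
    if not name or name not in tool_indices:
        print(f"Model attempted to call undefined function: {name}")  # dropped
    else:
        print(f"tool_index={tool_indices[name]}, args={action['arguments']}")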
@@ -112,9 +141,7 @@ class BaseFormatDetector:
         self, new_text: str, tools: List[Function]
     ) -> StreamingParseResult:
         """
-        Streaming incremental parsing
-        We partially parse JSON within <tool_call>...</tool_call>, and handle
-        incremental argument output.
+        Streaming incremental parsing with tool validation.
         """
         # Append new text to buffer
         self._buffer += new_text
@@ -125,17 +152,19 @@ class BaseFormatDetector:
             new_text = new_text.replace(self.eot_token, "")
             return StreamingParseResult(normal_text=new_text)
 
-        #
+        # Build tool indices if not already built
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = {
+                tool.function.name: i
+                for i, tool in enumerate(tools)
+                if tool.function and tool.function.name
+            }
+
         flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
         try:
             tool_call_arr = []
             is_complete = []
             try:
-                # depending on the prompt format the Llama model may or may not
-                # prefix the output with the <|python_tag|> token
                 start_idx = (
                     len(self.bot_token)
                     if current_text.startswith(self.bot_token)
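Note the hasattr guard above: the name-to-index map is built once, on the first streamed chunk, and reused for every later chunk of the same request, so per-chunk validation is a dictionary lookup instead of a rescan of the tool list. The idiom in isolation (a sketch, not sglang code):

# Sketch of the lazy caching idiom used for _tool_indices.
class StreamingDetector:
    def chunk_indices(self, tool_names: list) -> dict:
        if not hasattr(self, "_tool_indices"):
            # Built on the first chunk only.
            self._tool_indices = {name: i for i, name in enumerate(tool_names)}
        return self._tool_indices

d = StreamingDetector()
first = d.chunk_indices(["get_weather", "search"])
second = d.chunk_indices(["get_weather", "search"])
assert first is second  # same dict object; later chunks skip the rebuild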
@@ -149,8 +178,18 @@ class BaseFormatDetector:
                     _is_complete_json(current_text[start_idx : start_idx + end_idx])
                 )
                 start_idx += end_idx + len("; ")
-
-                #
+
+                # Validate tool name if present
+                if "name" in obj and obj["name"] not in self._tool_indices:
+                    # Invalid tool name - reset state
+                    self._buffer = ""
+                    self.current_tool_id = -1
+                    self.current_tool_name_sent = False
+                    if self.streamed_args_for_tool:
+                        self.streamed_args_for_tool.pop()
+                    return StreamingParseResult()
+
+                # Handle parameters/arguments consistency
                 if "parameters" in obj:
                     assert (
                         "arguments" not in obj
@@ -159,29 +198,17 @@ class BaseFormatDetector:
                 tool_call_arr.append(obj)
 
             except partial_json_parser.core.exceptions.MalformedJSON:
-                # not enough tokens to parse into JSON yet
                 return StreamingParseResult()
 
-            # select as the current tool call the one we're on the state at
-            current_tool_call: Dict = (
-                tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {}
-            )
-
-            # case -- if no tokens have been streamed for the tool, e.g.
-            # only the array brackets, stream nothing
             if len(tool_call_arr) == 0:
                 return StreamingParseResult()
 
-            if (
-                len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1
-            ):
+            current_tool_call: Dict = (
+                tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {}
+            )
 
-            # auto-generated due to JSON completions, but wasn't
-            # streamed to the client yet.
+            # Handle new tool in array
+            if len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1:
                 if self.current_tool_id >= 0:
                     cur_arguments = current_tool_call.get("arguments")
                     if cur_arguments:
@@ -190,7 +217,6 @@ class BaseFormatDetector:
                         argument_diff = cur_args_json[sent:]
 
                         res = StreamingParseResult(
-                            normal_text=None,
                             calls=[
                                 ToolCallItem(
                                     tool_index=self.current_tool_id,
@@ -206,23 +232,20 @@ class BaseFormatDetector:
                        res = StreamingParseResult()
                else:
                    res = StreamingParseResult()
-
+
                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
-                print("starting on new tool %d", self.current_tool_id)
                return res
 
-            #
-            # - otherwise send nothing
+            # Handle tool name
            elif not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
-                if function_name:
+                if function_name and function_name in self._tool_indices:
                    res = StreamingParseResult(
-                        normal_text=None,
                        calls=[
                            ToolCallItem(
-                                tool_index=self.
+                                tool_index=self._tool_indices[function_name],
                                name=function_name,
                                parameters="",
                            )
@@ -232,8 +255,7 @@ class BaseFormatDetector:
                else:
                    res = StreamingParseResult()
 
-            #
-            # arguments
+            # Handle streaming arguments
            else:
                cur_arguments = current_tool_call.get("arguments")
                res = StreamingParseResult()
@@ -250,13 +272,12 @@ class BaseFormatDetector:
                        argument_diff = cur_args_json[sent:]
                        self._buffer = ""
                        self.prev_tool_call_arr[self.current_tool_id].clear()
-                        self.current_tool_name_sent
+                        self.current_tool_name_sent = False
                        self.streamed_args_for_tool[self.current_tool_id] = ""
 
                    elif prev_arguments:
                        prev_args_json = json.dumps(prev_arguments)
                        if cur_args_json != prev_args_json:
-
                            prefix = _find_common_prefix(prev_args_json, cur_args_json)
                            argument_diff = prefix[sent:]
 
@@ -279,8 +300,7 @@ class BaseFormatDetector:
            return res
 
        except Exception as e:
-
-            # Skipping chunk as a result of tool streaming extraction error
+            logger.error(f"Error in parse_streaming_increment: {e}")
            return StreamingParseResult()
 
 
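
The streaming branch above depends on partial_json_parser's Allow flags: with Allow.ALL the parser completes truncated JSON, while masking out Allow.STR keeps a half-received string (such as a tool name) from being surfaced early, which is why the flags are gated on current_tool_name_sent. A small illustration (exact outputs may vary with the partial-json-parser version):

    import partial_json_parser
    from partial_json_parser.core.options import Allow

    chunk = '{"name": "get_wea'  # tool name still streaming in

    # Partial strings disallowed: the incomplete name is withheld.
    print(partial_json_parser.loads(chunk, Allow.ALL & ~Allow.STR))  # {}

    # Partial strings allowed: the fragment is surfaced as-is.
    print(partial_json_parser.loads(chunk, Allow.ALL))  # {'name': 'get_wea'}
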
@@ -372,31 +392,38 @@ class Llama32Detector(BaseFormatDetector):
    Detector for Llama 3.2 models.
    Assumes function call format:
        <|python_tag|>{"name":"xxx", "arguments":{...}}
-    Does not require a closing tag "</python_tag|>",
-    relies on json.loads(...) success to determine if JSON is complete.
    """
 
    def __init__(self):
-        """
-        Initializes the detector with necessary state variables.
-        """
        super().__init__()
        self.bot_token = "<|python_tag|>"
 
    def detect_and_parse(self, text: str, tools: List[Function]) -> List[ToolCallItem]:
-        """
-        One-time parsing: Detects and parses tool calls in the provided text.
-
-        :param text: The complete text to parse.
-        :param tools: List of available tools.
-        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
-        """
-
+        """Parse function calls from text, handling multiple JSON objects."""
        if "<|python_tag|>" not in text:
            return []
-
-
-
+
+        _, action_text = text.split("<|python_tag|>")
+
+        # Split by semicolon and process each part
+        json_parts = [part.strip() for part in action_text.split(";") if part.strip()]
+
+        all_actions = []
+        for part in json_parts:
+            try:
+                # Parse each individual JSON object
+                action = json.loads(part)
+                all_actions.append(action)
+            except json.JSONDecodeError as e:
+                logger.warning(f"Failed to parse JSON part: {part}")
+                logger.warning(f"JSON parse error: {str(e)}")
+                continue
+
+        # Only process if we found valid JSON objects
+        if all_actions:
+            return self.parse_base_json(all_actions, tools)
+
+        return []
 
 
 class MultiFormatParser:
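
The rewritten detect_and_parse treats everything after <|python_tag|> as a semicolon-separated sequence of JSON objects instead of a single blob. A self-contained sketch of that splitting strategy (the helper name and the simpler return shape are illustrative):

    import json

    def split_python_tag_calls(text: str, bot_token: str = "<|python_tag|>"):
        """Split model output after bot_token into individually parsed JSON objects."""
        if bot_token not in text:
            return []
        _, action_text = text.split(bot_token, 1)
        actions = []
        for part in (p.strip() for p in action_text.split(";")):
            if not part:
                continue
            try:
                actions.append(json.loads(part))
            except json.JSONDecodeError:
                continue  # tolerate malformed or trailing fragments
        return actions

    out = 'Sure.<|python_tag|>{"name": "a", "arguments": {}}; {"name": "b", "arguments": {"x": 1}}'
    print(split_python_tag_calls(out))
    # [{'name': 'a', 'arguments': {}}, {'name': 'b', 'arguments': {'x': 1}}]
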
{sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/double_sparsity_backend.py
RENAMED
@@ -17,12 +17,10 @@ class DoubleSparseAttnBackend(AttentionBackend):
    def __init__(self, model_runner: ModelRunner):
        # Lazy import to avoid the initialization of cuda context
        from sglang.srt.layers.attention.triton_ops.double_sparsity_attention import (
+            extend_attention_fwd,
            flash_decode_attention_fwd,
            flash_decode_sparse_attention_fwd,
        )
-        from sglang.srt.layers.attention.triton_ops.extend_attention import (
-            extend_attention_fwd,
-        )
 
        super().__init__()
 
{sglang-0.4.2.post2 → sglang-0.4.2.post4}/sglang/srt/layers/attention/flashinfer_backend.py
RENAMED
@@ -70,6 +70,8 @@ class FlashInferAttnBackend(AttentionBackend):
    ):
        super().__init__()
 
+        self.is_multimodal = model_runner.model_config.is_multimodal
+
        # Parse constants
        self.decode_use_tensor_cores = should_use_tensor_core(
            kv_cache_dtype=model_runner.kv_cache_dtype,
@@ -130,12 +132,8 @@ class FlashInferAttnBackend(AttentionBackend):
            for _ in range(self.num_wrappers)
        ]
 
-
-
-        self.prefill_wrapper_ragged = (
-            BatchPrefillWithRaggedKVCacheWrapper(self.workspace_buffer, "NHD")
-            if self.num_wrappers == 1
-            else None
+        self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
+            self.workspace_buffer, "NHD"
        )
 
        # Two wrappers: one for sliding window attention and one for full attention.
@@ -217,13 +215,12 @@ class FlashInferAttnBackend(AttentionBackend):
        else:
            prefix_lens = forward_batch.extend_prefix_lens
 
-
-        if forward_batch.extend_num_tokens >= 4096 and self.num_wrappers == 1:
-            use_ragged = True
-            extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
-        else:
+        if self.is_multimodal:
            use_ragged = False
            extend_no_prefix = False
+        else:
+            use_ragged = True
+            extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
 
        self.indices_updater_prefill.update(
            forward_batch.req_pool_indices,
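
The prefill path now keys the ragged-versus-paged decision on whether the model is multimodal rather than on the 4096-token threshold: multimodal batches (which may carry custom attention masks) always take the paged wrapper, and everything else uses the ragged one. The selection logic, restated as a standalone sketch (field names mirror the diff; the function itself is illustrative):

    def choose_prefill_mode(is_multimodal: bool, extend_prefix_lens_cpu: list) -> dict:
        """Mirror the new selection: ragged prefill unless the model is multimodal."""
        if is_multimodal:
            return {"use_ragged": False, "extend_no_prefix": False}
        return {
            "use_ragged": True,
            # True only when no request in the batch has a cached prefix.
            "extend_no_prefix": not any(extend_prefix_lens_cpu),
        }

    print(choose_prefill_mode(False, [0, 0, 4]))
    # {'use_ragged': True, 'extend_no_prefix': False}
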
@@ -409,9 +406,9 @@ class FlashInferAttnBackend(AttentionBackend):
            )
        else:
            o1, s1 = self.prefill_wrapper_ragged.forward_return_lse(
-                q.
-                k.
-                v.
+                q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                k.view(-1, layer.tp_k_head_num, layer.head_dim),
+                v.view(-1, layer.tp_v_head_num, layer.head_dim),
                causal=True,
                sm_scale=layer.scaling,
                logits_soft_cap=logits_soft_cap,
@@ -640,7 +637,6 @@ class FlashInferIndicesUpdaterDecode:
            kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
            bs = kv_indptr.shape[0] - 1
 
-        wrapper.end_forward()
        wrapper.begin_forward(
            kv_indptr,
            kv_indices,
@@ -651,6 +647,7 @@ class FlashInferIndicesUpdaterDecode:
            1,
            data_type=self.data_type,
            q_data_type=self.q_data_type,
+            non_blocking=True,
        )
 
@@ -860,7 +857,6 @@ class FlashInferIndicesUpdaterPrefill:
 
        # extend part
        if use_ragged:
-            wrapper_ragged.end_forward()
            wrapper_ragged.begin_forward(
                qo_indptr,
                qo_indptr,
@@ -871,7 +867,6 @@ class FlashInferIndicesUpdaterPrefill:
            )
 
        # cached part
-        wrapper_paged.end_forward()
        wrapper_paged.begin_forward(
            qo_indptr,
            kv_indptr,
@@ -883,6 +878,7 @@ class FlashInferIndicesUpdaterPrefill:
            1,
            q_data_type=self.q_data_type,
            custom_mask=custom_mask,
+            non_blocking=True,
        )
 
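
Both begin_forward calls now pass non_blocking=True, so the host-to-device copies of the freshly built index tensors are enqueued asynchronously instead of blocking the CPU. The same idiom in plain PyTorch (a sketch of the pattern, not of flashinfer's internals):

    import torch

    if torch.cuda.is_available():
        # Pinned host memory is required for a truly asynchronous copy.
        src = torch.arange(1 << 20, dtype=torch.int32).pin_memory()
        dst = src.to("cuda", non_blocking=True)  # enqueued; CPU keeps going
        # ... overlapping host-side work would go here ...
        torch.cuda.synchronize()  # make sure the copy finished before reading
        assert int(dst[-1]) == (1 << 20) - 1
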
@@ -924,38 +920,50 @@ class FlashInferMultiStepDraftBackend:
        self.max_context_len = self.attn_backends[0].max_context_len
        # Cached variables for generate_draft_decode_kv_indices
        self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
-        self.kv_indptr_stride = self.kv_indptr.shape[1]
 
-    def common_template(
+    def common_template(
+        self, forward_batch: ForwardBatch, kv_indices_buffer: torch.Tensor, call_fn: int
+    ):
        num_seqs = forward_batch.batch_size
        bs = self.topk * num_seqs
        seq_lens_sum = forward_batch.seq_lens_sum
+
        self.generate_draft_decode_kv_indices[
            (self.speculative_num_steps, num_seqs, self.topk)
        ](
            forward_batch.req_pool_indices,
            forward_batch.req_to_token_pool.req_to_token,
            forward_batch.seq_lens,
-
+            kv_indices_buffer,
            self.kv_indptr,
            forward_batch.positions,
            num_seqs,
            self.topk,
            self.pool_len,
-
+            kv_indices_buffer.shape[1],
            self.kv_indptr.shape[1],
            triton.next_power_of_2(num_seqs),
            triton.next_power_of_2(self.speculative_num_steps),
            triton.next_power_of_2(bs),
        )
+
        for i in range(self.speculative_num_steps):
            forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
-            forward_batch.spec_info.kv_indices =
+            forward_batch.spec_info.kv_indices = kv_indices_buffer[i][
                : seq_lens_sum * self.topk + bs * (i + 1)
            ]
            call_fn(i, forward_batch)
 
    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        kv_indices = torch.zeros(
+            (
+                self.speculative_num_steps,
+                forward_batch.batch_size * self.topk * self.max_context_len,
+            ),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
        def call_fn(i, forward_batch):
            forward_batch.spec_info.kv_indptr = (
                forward_batch.spec_info.kv_indptr.clone()
@@ -965,7 +973,7 @@ class FlashInferMultiStepDraftBackend:
            )
            self.attn_backends[i].init_forward_metadata(forward_batch)
 
-        self.common_template(forward_batch, call_fn)
+        self.common_template(forward_batch, kv_indices, call_fn)
 
    def init_cuda_graph_state(self, max_bs: int):
        self.cuda_graph_kv_indices = torch.zeros(
@@ -973,7 +981,6 @@ class FlashInferMultiStepDraftBackend:
            dtype=torch.int32,
            device="cuda",
        )
-        self.kv_indptr_stride = self.cuda_graph_kv_indices.shape[1]
        for i in range(self.speculative_num_steps):
            self.attn_backends[i].init_cuda_graph_state(
                max_bs, kv_indices_buf=self.cuda_graph_kv_indices[i]
@@ -995,7 +1002,7 @@ class FlashInferMultiStepDraftBackend:
        ][0]
        decode_wrapper.begin_forward = partial(fast_decode_plan, decode_wrapper)
 
-        self.common_template(forward_batch, call_fn)
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
 
    def init_forward_metadata_replay_cuda_graph(self, forward_batch):
        def call_fn(i, forward_batch):
@@ -1009,7 +1016,7 @@ class FlashInferMultiStepDraftBackend:
            spec_info=forward_batch.spec_info,
        )
 
-        self.common_template(forward_batch, call_fn)
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
 
 
 @triton.jit
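
common_template now receives the KV-index buffer explicitly: the eager path allocates a fresh (num_steps, batch * topk * max_context_len) tensor per forward, the CUDA-graph paths reuse self.cuda_graph_kv_indices, and step i reads a growing slice of its own row. The slice arithmetic in isolation (values invented for illustration):

    import torch

    speculative_num_steps, batch_size, topk, max_context_len = 3, 2, 4, 16
    seq_lens_sum = 20  # total tokens across the batch

    kv_indices = torch.zeros(
        (speculative_num_steps, batch_size * topk * max_context_len),
        dtype=torch.int32,
    )

    bs = topk * batch_size
    for i in range(speculative_num_steps):
        # Each step sees one more draft token per sequence-path than the last.
        step_view = kv_indices[i][: seq_lens_sum * topk + bs * (i + 1)]
        print(i, step_view.shape)
    # 0 torch.Size([88]); 1 torch.Size([96]); 2 torch.Size([104])
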
@@ -1070,21 +1077,6 @@ def should_use_tensor_core(
    if env_override is not None:
        return env_override.lower() == "true"
 
-    # Try to use _grouped_size_compiled_for_decode_kernels if available
-    # This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
-    try:
-        from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
-
-        if not _grouped_size_compiled_for_decode_kernels(
-            num_attention_heads,
-            num_kv_heads,
-        ):
-            return True
-        else:
-            return False
-    except (ImportError, AttributeError):
-        pass
-
    # Calculate GQA group size
    gqa_group_size = num_attention_heads // num_kv_heads
 
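
With the flashinfer<=0.1.6 probe gone, should_use_tensor_core reduces to the environment override plus the GQA group size: models whose query heads outnumber their KV heads keep tensor cores busy during decode. A simplified sketch of what remains (the env var name, the cutoff, and the kv-dtype special cases are assumptions; consult the full function for the exact rules):

    import os

    def should_use_tensor_core(num_attention_heads: int, num_kv_heads: int) -> bool:
        """Sketch: env override first, then the GQA group-size heuristic."""
        env_override = os.environ.get("SGLANG_FLASHINFER_USE_TENSOR_CORE")
        if env_override is not None:
            return env_override.lower() == "true"
        gqa_group_size = num_attention_heads // num_kv_heads
        return gqa_group_size > 4  # assumed cutoff, for illustration only

    print(should_use_tensor_core(32, 4))   # True: 8 query heads per KV head
    print(should_use_tensor_core(32, 32))  # False: plain multi-head attention
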
@@ -1114,6 +1106,7 @@ def fast_decode_plan(
    sm_scale: Optional[float] = None,
    rope_scale: Optional[float] = None,
    rope_theta: Optional[float] = None,
+    **kwargs,
 ) -> None:
    """A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for FlashInferMultiStepDraftBackend."""
    batch_size = len(last_page_len)