sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +4 -0
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +16 -1
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mooncake/conn.py +16 -0
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/openai/serving_chat.py +132 -79
- sglang/srt/function_call/ebnf_composer.py +10 -3
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/qwen3_coder_detector.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +14 -3
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +84 -22
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +25 -10
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/lora/lora_registry.py +93 -29
- sglang/srt/managers/cache_controller.py +9 -7
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +14 -8
- sglang/srt/managers/scheduler.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +37 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -2
- sglang/srt/model_executor/model_runner.py +68 -14
- sglang/srt/models/deepseek_v2.py +62 -28
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/qwen2_moe.py +2 -2
- sglang/srt/models/qwen3_moe.py +5 -2
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +57 -6
- sglang/srt/utils.py +96 -1
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +83 -73
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/lang/chat_template.py
CHANGED
@@ -448,6 +448,19 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="interns1",
+        default_system_prompt="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="granite-3-instruct",
@@ -609,6 +622,14 @@ def match_internvl_chat(model_path: str):
         return "internvl-2-5"
 
 
+@register_chat_template_matching_function
+def match_interns1_chat(model_path: str):
+    if re.search(r"intern-s1", model_path, re.IGNORECASE):
+        return "interns1"
+    if re.search(r"interns1", model_path, re.IGNORECASE):
+        return "interns1"
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
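As a quick illustration of the template registered above, the sketch below is not part of the diff; render_interns1 is a hypothetical helper written only for this example. It shows how the interns1 role prefixes and suffixes compose into a prompt string:

role_prefix_and_suffix = {
    "system": ("<|im_start|>system\n", "<|im_end|>\n"),
    "user": ("<|im_start|>user\n", "<|im_end|>\n"),
    "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
}


def render_interns1(messages):
    # Wrap each message in its role prefix/suffix, then open an assistant
    # turn so generation continues from there (hypothetical helper).
    out = ""
    for role, content in messages:
        prefix, suffix = role_prefix_and_suffix[role]
        out += prefix + content + suffix
    return out + role_prefix_and_suffix["assistant"][0]


print(render_interns1([("system", "You are Intern-S1."), ("user", "Hi!")]))
# <|im_start|>system
# You are Intern-S1.<|im_end|>
# <|im_start|>user
# Hi!<|im_end|>
# <|im_start|>assistant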
sglang/srt/configs/internvl.py
CHANGED
@@ -10,6 +10,7 @@ from transformers import (
     PretrainedConfig,
     PreTrainedTokenizer,
     Qwen2Config,
+    Qwen3Config,
 )
 
 from sglang.utils import logger
@@ -314,6 +315,8 @@ class InternVLChatConfig(PretrainedConfig):
             self.llm_config = InternLM2Config(**llm_config)
         elif llm_config.get("architectures")[0] == "Qwen2ForCausalLM":
             self.llm_config = Qwen2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen3MoeForCausalLM":
+            self.llm_config = Qwen3Config(**llm_config)
         else:
             raise ValueError(
                 "Unsupported architecture: {}".format(
sglang/srt/configs/model_config.py
CHANGED
@@ -127,6 +127,9 @@ class ModelConfig:
         ):
             self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
 
+        if is_draft_model and self.hf_config.architectures[0] == "Glm4MoeForCausalLM":
+            self.hf_config.architectures[0] = "Glm4MoeForCausalLMNextN"
+
         if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
             self.hf_config.architectures[0] = "MiMoMTP"
         # Check model type
@@ -635,6 +638,7 @@ multimodal_model_archs = [
     "Qwen2_5_VLForConditionalGeneration",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
+    "InternS1ForConditionalGeneration",
     "Phi4MMForCausalLM",
     "VILAForConditionalGeneration",
 ]
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -168,7 +168,10 @@ class BaseGrammarBackend:
 
 
 def create_grammar_backend(
-    server_args: ServerArgs,
+    server_args: ServerArgs,
+    tokenizer,
+    vocab_size: int,
+    eos_token_ids: Optional[set] = None,
 ) -> Optional[BaseGrammarBackend]:
     if server_args.grammar_backend == "outlines":
         from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
@@ -180,7 +183,12 @@ def create_grammar_backend(
     elif server_args.grammar_backend == "xgrammar":
        from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend
 
-
+        # Convert Set[int] to List[int] if needed
+        eos_list = list(eos_token_ids) if eos_token_ids else None
+
+        grammar_backend = XGrammarGrammarBackend(
+            tokenizer, vocab_size=vocab_size, model_eos_token_ids=eos_list
+        )
     elif server_args.grammar_backend == "llguidance":
         from sglang.srt.constrained.llguidance_backend import GuidanceBackend
 
sglang/srt/constrained/xgrammar_backend.py
CHANGED
@@ -150,14 +150,16 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         self,
         tokenizer,
         vocab_size: int,
+        model_eos_token_ids: Optional[List[int]] = None,
     ):
         super().__init__()
 
-
-
-
-
-
+        # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
+        # This ensures consistency between what the model considers EOS and what XGrammar uses
+        tokenizer_info = TokenizerInfo.from_huggingface(
+            tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
+        )
+        override_stop_tokens = None
 
         self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
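Together, the two changes above thread the model's EOS token ids from create_grammar_backend into XGrammar's TokenizerInfo. Below is a minimal sketch of the new call path, assuming an already-constructed ServerArgs with grammar_backend="xgrammar"; the tokenizer, model name, and vocab size are illustrative, since in SGLang the scheduler supplies them from the loaded model:

from transformers import AutoTokenizer

from sglang.srt.constrained.base_grammar_backend import create_grammar_backend

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # illustrative
grammar_backend = create_grammar_backend(
    server_args,                             # assumed pre-existing ServerArgs instance
    tokenizer,
    vocab_size=len(tokenizer),               # illustrative; normally the model's vocab size
    eos_token_ids={tokenizer.eos_token_id},  # forwarded to XGrammar as stop tokens
)
# Inside the backend, the set is converted to a list and passed as
# TokenizerInfo.from_huggingface(tokenizer, vocab_size=..., stop_token_ids=eos_list).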
sglang/srt/conversation.py
CHANGED
@@ -623,7 +623,7 @@ def generate_chat_conv(
                 real_content += content.text
             elif content.type == "image_url":
                 # NOTE: works for llava and intervl2_5
-                if conv.name
+                if conv.name in ["internvl-2-5", "interns1"]:
                     real_content = image_token + real_content
                 else:
                     real_content += image_token
@@ -817,6 +817,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="interns1",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
+        roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+        sep_style=SeparatorStyle.MPT,
+        sep="<|im_end|>\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+        image_token="<image>",
+    )
+)
+
 # Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
 register_conv_template(
     Conversation(
@@ -986,6 +999,8 @@ register_conv_template(
 def match_internvl(model_path: str):
     if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
+    if re.search(r"intern.*s1", model_path, re.IGNORECASE):
+        return "interns1"
 
 
 @register_conv_template_matching_function
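A small illustration of the generate_chat_conv branch changed above: for interns1 (like internvl-2-5) the image token is prepended to the text of an image_url part, while other templates append it. This is a plain sketch, not the actual function body:

image_token = "<image>"
text = "Describe this picture."

real_content_interns1 = image_token + text  # internvl-2-5 / interns1 branch
real_content_default = text + image_token   # every other template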
sglang/srt/debug_utils/__init__.py
ADDED
File without changes
sglang/srt/debug_utils/dump_comparator.py
ADDED
@@ -0,0 +1,131 @@
+import argparse
+import functools
+import re
+from pathlib import Path
+
+import polars as pl
+import torch
+
+from sglang.srt.debug_utils.dumper import get_truncated_value
+
+
+def main(args):
+    df_target = read_meta(args.target_path)
+    df_target = df_target.sort("rank", "dump_index")
+    df_target = df_target.filter(
+        (pl.col("forward_pass_id") >= args.start_id)
+        & (pl.col("forward_pass_id") <= args.end_id)
+    )
+    assert all(
+        c in df_target.columns
+        for c in ["rank", "forward_pass_id", "dump_index", "name"]
+    )
+
+    df_baseline = read_meta(args.baseline_path)
+    print("df_target", df_target)
+    print("df_baseline", df_baseline)
+
+    for row in df_target.iter_rows(named=True):
+        rows_baseline = df_baseline.filter(
+            (
+                pl.col("forward_pass_id")
+                == row["forward_pass_id"] - args.start_id + args.baseline_start_id
+            )
+            & functools.reduce(
+                lambda a, b: a & b,
+                [
+                    pl.col(col) == row[col]
+                    for col in row.keys()
+                    if col not in ["forward_pass_id", "dump_index", "filename"]
+                ],
+            )
+        )
+        assert len(rows_baseline) == 1, f"{rows_baseline=}"
+        row_baseline = rows_baseline.to_dicts()[0]
+
+        path_baseline = Path(args.baseline_path) / row_baseline["filename"]
+        path_target = Path(args.target_path) / row["filename"]
+        print(f"Check: target={str(path_target)} baseline={str(path_baseline)}")
+        check_tensor_pair(path_baseline=path_baseline, path_target=path_target)
+        print()
+
+
+def read_meta(directory):
+    directory = Path(directory)
+    assert directory.is_dir(), f"{directory=} should be a directory"
+
+    rows = []
+    for p in directory.glob("*.pt"):
+        full_kwargs = {}
+        for kv in p.stem.split("___"):
+            k, v = kv.split("=")
+            full_kwargs[k] = v
+        rows.append(
+            {
+                "filename": str(p.name),
+                **full_kwargs,
+            }
+        )
+
+    df = pl.DataFrame(rows)
+    df = df.with_columns(
+        pl.col("forward_pass_id").cast(int),
+        pl.col("rank").cast(int),
+    )
+    return df
+
+
+def check_tensor_pair(path_baseline, path_target):
+    x_baseline = torch.load(path_baseline, weights_only=True)
+    x_target = torch.load(path_target, weights_only=True)
+
+    print(
+        f"[shape] {x_baseline.shape} vs {x_target.shape}\t"
+        f"[dtype] {x_baseline.dtype} vs {x_target.dtype}"
+    )
+
+    if x_baseline.shape != x_target.shape:
+        print(f"❌ Shape mismatch")
+        return
+
+    raw_abs_diff = (x_target - x_baseline).abs()
+
+    max_abs_diff = raw_abs_diff.max().item()
+    mean_abs_diff = raw_abs_diff.mean().item()
+    rel_diff = _calc_rel_diff(x_target, x_baseline)
+
+    needs_print = max_abs_diff > 1e-3
+
+    print(
+        "\t".join(
+            f"{'❌' if value > 1e-3 else '✅'} {name}={value}"
+            for name, value in [
+                ("rel_diff", rel_diff),
+                ("max_abs_diff", max_abs_diff),
+                ("mean_abs_diff", mean_abs_diff),
+            ]
+        )
+    )
+
+    if needs_print:
+        print(f"x_baseline(sample)={get_truncated_value(x_baseline)}")
+        print(f"x_target(sample)={get_truncated_value(x_target)}")
+
+
+# Copied from DeepGEMM
+def _calc_rel_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--baseline-path", type=str)
+    parser.add_argument("--target-path", type=str)
+    parser.add_argument("--start-id", type=int, default=0)
+    parser.add_argument("--end-id", type=int, default=1000000)
+    parser.add_argument("--baseline-start-id", type=int, default=0)
+    args = parser.parse_args()
+    main(args)
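Once two runs have been dumped with the dumper defined in the next file, a typical invocation of this comparator might look like the following (directory names and id offsets are illustrative):

python -m sglang.srt.debug_utils.dump_comparator --baseline-path /tmp/sglang_dump_<baseline_run> --target-path /tmp/sglang_dump_<target_run> --start-id 1 --end-id 3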
sglang/srt/debug_utils/dumper.py
ADDED
@@ -0,0 +1,108 @@
+import os
+import time
+from pathlib import Path
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+
+class _Dumper:
+    """Utility to dump tensors, which can be useful when comparison checking models.
+
+    Example usage:
+    dumper.on_forward_pass_start()
+    dumper.dump("layer_start__hidden_states", hidden_states, layer_id=self.layer_id)
+
+    Import from non-SGLang system:
+    ```
+    import sys
+    sys.path.append("/YOUR_PATH/sglang/python/sglang/srt/debug_utils")
+    from dumper import dumper
+    ```
+
+    Related: `sglang.srt.debug_utils.dump_comparator` for dump comparison
+    """
+
+    def __init__(self):
+        # Do not import `sglang` to make this file standalone
+        self._enable = bool(int(os.environ.get("SGLANG_DUMPER_ENABLE", "1")))
+        self._base_dir = Path(os.environ.get("SGLANG_DUMPER_DIR", "/tmp"))
+        self._enable_write_file = bool(
+            int(os.environ.get("SGLANG_DUMPER_WRITE_FILE", "1"))
+        )
+        self._partial_name: Optional[str] = None
+        self._dump_index = 0
+        self._forward_pass_id = 0
+
+    def on_forward_pass_start(self):
+        self._forward_pass_id += 1
+        print(
+            f"[Dumper] [{time.time()}] on_forward_pass_start id={self._forward_pass_id}"
+        )
+
+    def dump(self, name, value, **kwargs):
+        if not self._enable:
+            return
+
+        assert (
+            self._forward_pass_id >= 1
+        ), "Do you forget to call `dumper.on_forward_pass_start()`?"
+        self._dump_index += 1
+
+        if self._partial_name is None:
+            self._partial_name = _get_partial_name()
+
+        rank = dist.get_rank()
+        full_kwargs = dict(
+            forward_pass_id=self._forward_pass_id,
+            rank=rank,
+            name=name,
+            dump_index=self._dump_index,
+            **kwargs,
+        )
+        full_filename = "___".join(f"{k}={v}" for k, v in full_kwargs.items()) + ".pt"
+        path = self._base_dir / f"sglang_dump_{self._partial_name}" / full_filename
+
+        sample_value = get_truncated_value(value)
+
+        print(
+            f"[Dumper] [{rank}, {time.time()}] {path} "
+            f"type={type(value)} "
+            f"shape={value.shape if isinstance(value, torch.Tensor) else None} "
+            f"dtype={value.dtype if isinstance(value, torch.Tensor) else None} "
+            f"sample_value={sample_value}"
+        )
+
+        if self._enable_write_file:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            torch.save(value, str(path))
+
+
+def _get_partial_name():
+    rank = dist.get_rank()
+    object_list = [str(time.time()) if rank == 0 else None]
+    dist.broadcast_object_list(object_list, device="cuda")
+    return object_list[0]
+
+
+def get_truncated_value(value):
+    if value is None:
+        return None
+
+    if isinstance(value, tuple):
+        return [get_truncated_value(x) for x in value]
+
+    if not isinstance(value, torch.Tensor):
+        return None
+
+    if value.numel() < 200:
+        return value
+
+    slices = [
+        slice(0, 5) if dim_size > 200 else slice(None) for dim_size in value.shape
+    ]
+    return value[tuple(slices)]
+
+
+dumper = _Dumper()
sglang/srt/debug_utils/text_comparator.py
ADDED
@@ -0,0 +1,172 @@
+import argparse
+import json
+from pathlib import Path
+
+import polars as pl
+
+_DESCRIPTION = """Compare and find differences to benchmark outputs.
+
+Supported inputs:
+* The samples jsonl from `lm_eval --log_samples --output_path FOLDER_NAME`
+* The output from `gsm8k/bench_sglang.py --raw-result-file FILE_NAME` (or mmlu)
+"""
+
+
+def main(args):
+    df_input = _transform_df_input(_compute_df_raw(args))
+    assert all(
+        c in df_input.columns
+        for c in ["category", "trial_index", "prompt_id", "prompt", "output", "correct"]
+    )
+
+    df_meta = _compute_df_meta(df_input)
+
+    df_correctness_per_trial = df_input.group_by(
+        "category", "trial_index", maintain_order=True
+    ).agg(pl.col("correct").mean())
+    df_correctness_delta = (
+        df_meta.group_by("correctness_delta").len().sort("correctness_delta")
+    )
+    df_good_to_bad = df_meta.filter(pl.col("correctness_delta") < 0)
+    df_bad_to_good = df_meta.filter(pl.col("correctness_delta") > 0)
+
+    print(f"Dump output to {args.output_path}")
+    Path(args.output_path).write_text(
+        json.dumps(
+            dict(
+                df_meta=df_meta.to_dicts(),
+                df_good_to_bad=df_good_to_bad.to_dicts(),
+                df_bad_to_good=df_bad_to_good.to_dicts(),
+            )
+        )
+    )
+
+    if not args.disable_print_details:
+        with pl.Config(
+            fmt_str_lengths=10000,
+            tbl_cols=-1,
+            tbl_rows=-1,
+            tbl_width_chars=-1,
+            tbl_formatting="UTF8_FULL",
+        ):
+            print("====== Correctness per trial ======")
+            print(df_correctness_per_trial)
+
+            print(
+                "====== Correctness Delta (-1.0 means all-right becomes all-wrong) ======"
+            )
+            print(df_correctness_delta)
+
+            for name, df in [
+                ("Good->Bad", df_good_to_bad),
+                ("Bad->Good", df_bad_to_good),
+            ]:
+                print(f"====== Concrete Examples: {name} ======")
+                print(df)
+
+
+def _compute_df_raw(args):
+    return pl.concat(
+        [
+            _read_df_raw(p, category=category, trial_index=i)
+            for category, paths in [
+                ("baseline", args.baseline_path),
+                ("target", args.target_path),
+            ]
+            for i, p in enumerate(paths)
+        ]
+    )
+
+
+def _read_df_raw(path: str, category: str, trial_index: int):
+    return pl.read_ndjson(path).with_columns(
+        category=pl.lit(category), trial_index=trial_index
+    )
+
+
+def _transform_df_input(df: pl.DataFrame):
+    if "doc_id" in df.columns:
+        print("Transform mode: lm_eval")
+
+        filter_names = df["filter"].unique(maintain_order=True).to_list()
+        if len(filter_names) > 1:
+            filter_name = filter_names[0]
+            print(f"Choose {filter_name=} among {filter_names}")
+            df = df.filter(pl.col("filter") == filter_name)
+
+        df = df.select(
+            pl.col("category"),
+            pl.col("trial_index"),
+            prompt_id=pl.col("doc_id"),
+            prompt=pl.col("arguments").struct.field("gen_args_0").struct.field("arg_0"),
+            output=pl.col("resps").list.get(0).list.get(0),
+            correct=pl.col("exact_match").cast(bool),
+        )
+
+        return df
+    elif "prompt_id" in df.columns:
+        print("Transform mode: SGLang bench")
+        return df
+    else:
+        raise Exception(f"Unknown data: {df.columns}")
+
+
+def _compute_df_meta(df_input: pl.DataFrame):
+    df_input = df_input.sort("prompt_id", "category", "trial_index")
+    df_meta = pl.DataFrame(
+        [
+            _handle_one_prompt(df_one_prompt)
+            for df_one_prompt in df_input.partition_by("prompt_id", maintain_order=True)
+        ]
+    )
+    df_meta = df_meta.with_columns(
+        correctness_delta=pl.col("correctness_target") - pl.col("correctness_baseline"),
+    )
+    df_meta = df_meta.sort("correctness_delta", "output_same_prefix_len")
+    return df_meta
+
+
+def _handle_one_prompt(df_one_prompt: pl.DataFrame):
+    assert len(set(df_one_prompt["prompt"])) == 1
+
+    df_baseline = df_one_prompt.filter(pl.col("category") == "baseline")
+    df_target = df_one_prompt.filter(pl.col("category") == "target")
+
+    outputs_baseline = df_baseline["output"].to_list()
+    outputs_target = df_target["output"].to_list()
+
+    output_same_prefix_len = max(
+        _compute_str_prefix_len(output_baseline, output_target)
+        for output_baseline in outputs_baseline
+        for output_target in outputs_target
+    )
+
+    return dict(
+        prompt_id=df_one_prompt[0, "prompt_id"],
+        correctness_baseline=df_baseline["correct"].mean(),
+        correctness_target=df_target["correct"].mean(),
+        output_same_prefix_len=output_same_prefix_len,
+        prompt=df_one_prompt[0, "prompt"],
+        outputs_baseline=outputs_baseline,
+        outputs_target=outputs_target,
+    )
+
+
+def _compute_str_prefix_len(a: str, b: str) -> int:
+    min_len = min(len(a), len(b))
+    for i in range(min_len):
+        if a[i] != b[i]:
+            return i
+    return min_len
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=_DESCRIPTION)
+    parser.add_argument("--baseline-path", type=str, nargs="+")
+    parser.add_argument("--target-path", type=str, nargs="+")
+    parser.add_argument(
+        "--output-path", type=str, default="/tmp/text_comparator_output.json"
+    )
+    parser.add_argument("--disable-print-details", action="store_true")
+    args = parser.parse_args()
+    main(args)
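For example, to compare one baseline lm_eval samples file against one target run (file names are illustrative; multiple files per side are accepted since both flags use nargs="+"):

python -m sglang.srt.debug_utils.text_comparator --baseline-path baseline_samples.jsonl --target-path target_samples.jsonl --output-path /tmp/text_comparator_output.json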
sglang/srt/disaggregation/decode_schedule_batch_mixin.py
CHANGED
@@ -1,10 +1,12 @@
 from __future__ import annotations
 
 import logging
+from http import HTTPStatus
 from typing import TYPE_CHECKING
 
 import torch
 
+from sglang.srt.disaggregation.utils import prepare_abort
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 
@@ -102,7 +104,17 @@ class ScheduleBatchDisaggregationDecodeMixin:
             self.output_ids.append(req.output_ids[-1])
             self.tree_cache.cache_unfinished_req(req)
             if req.grammar is not None:
-
+                # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                try:
+                    req.grammar.accept_token(req.output_ids[-1])
+                except ValueError as e:
+                    # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                    # This can happen if the grammar is not set correctly or the token is invalid.
+                    error_message = f"Grammar accept_token failed for req {req.rid} with token {req.output_ids[-1]}: {e}"
+                    self.tree_cache.cache_finished_req(req)
+                    prepare_abort(
+                        req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
+                    )
                 req.grammar.finished = req.finished()
         self.output_ids = torch.tensor(self.output_ids, device=self.device)
 
sglang/srt/disaggregation/mooncake/conn.py
CHANGED
@@ -992,6 +992,14 @@ class MooncakeKVSender(BaseKVSender):
             )
             raise KVTransferError(self.bootstrap_room, failure_reason)
 
+    def abort(self):
+        self.kv_mgr.record_failure(
+            self.bootstrap_room,
+            "Aborted by AbortReq.",
+        )
+        # Explicitly set the status to failure since this request has been aborted
+        self.conclude_state = KVPoll.Failed
+
 
 class MooncakeKVReceiver(BaseKVReceiver):
     _ctx = zmq.Context()
@@ -1305,6 +1313,14 @@ class MooncakeKVReceiver(BaseKVReceiver):
             )
             raise KVTransferError(self.bootstrap_room, failure_reason)
 
+    def abort(self):
+        self.kv_mgr.record_failure(
+            self.bootstrap_room,
+            "Aborted by AbortReq.",
+        )
+        # Explicitly set the status to failure since this request has been aborted
+        self.conclude_state = KVPoll.Failed
+
 
 class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
     def __init__(self, port: int):
sglang/srt/disaggregation/prefill.py
CHANGED
@@ -425,7 +425,19 @@ class SchedulerDisaggregationPrefillMixin:
                     self.send_kv_chunk(req, last_chunk=True)
 
                 if req.grammar is not None:
-
+                    # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                    try:
+                        req.grammar.accept_token(next_token_id)
+                    except ValueError as e:
+                        # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                        # This can happen if the grammar is not set correctly or the token is invalid.
+                        error_message = f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
+                        self.tree_cache.cache_finished_req(req)
+                        prepare_abort(
+                            req,
+                            error_message,
+                            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+                        )
                     req.grammar.finished = req.finished()
             else:
                 # being chunked reqs' prefill is not finished
sglang/srt/entrypoints/engine.py
CHANGED
@@ -640,7 +640,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.9rc2",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -765,7 +765,9 @@ def _launch_subprocesses(
         # When using `Engine` as a Python API, we don't want to block here.
         return None, None, None
 
-    launch_dummy_health_check_server(
+    launch_dummy_health_check_server(
+        server_args.host, server_args.port, server_args.enable_metrics
+    )
 
     for proc in scheduler_procs:
         proc.join()