sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/_custom_ops.py +29 -1
  3. sglang/srt/configs/internvl.py +3 -0
  4. sglang/srt/configs/model_config.py +5 -1
  5. sglang/srt/constrained/base_grammar_backend.py +10 -2
  6. sglang/srt/constrained/xgrammar_backend.py +7 -5
  7. sglang/srt/conversation.py +17 -2
  8. sglang/srt/debug_utils/__init__.py +0 -0
  9. sglang/srt/debug_utils/dump_comparator.py +131 -0
  10. sglang/srt/debug_utils/dumper.py +108 -0
  11. sglang/srt/debug_utils/text_comparator.py +172 -0
  12. sglang/srt/disaggregation/common/conn.py +34 -6
  13. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  14. sglang/srt/disaggregation/mini_lb.py +3 -2
  15. sglang/srt/disaggregation/mooncake/conn.py +65 -20
  16. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  17. sglang/srt/disaggregation/nixl/conn.py +17 -13
  18. sglang/srt/disaggregation/prefill.py +13 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  21. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  22. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  23. sglang/srt/distributed/parallel_state.py +70 -15
  24. sglang/srt/entrypoints/engine.py +5 -9
  25. sglang/srt/entrypoints/http_server.py +20 -32
  26. sglang/srt/entrypoints/openai/protocol.py +3 -3
  27. sglang/srt/entrypoints/openai/serving_chat.py +148 -72
  28. sglang/srt/function_call/base_format_detector.py +74 -12
  29. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  30. sglang/srt/function_call/ebnf_composer.py +105 -66
  31. sglang/srt/function_call/function_call_parser.py +6 -4
  32. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  33. sglang/srt/function_call/kimik2_detector.py +41 -16
  34. sglang/srt/function_call/llama32_detector.py +6 -3
  35. sglang/srt/function_call/mistral_detector.py +11 -3
  36. sglang/srt/function_call/pythonic_detector.py +16 -14
  37. sglang/srt/function_call/qwen25_detector.py +12 -3
  38. sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +11 -9
  39. sglang/srt/layers/activation.py +11 -3
  40. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  41. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  42. sglang/srt/layers/attention/vision.py +56 -8
  43. sglang/srt/layers/communicator.py +12 -12
  44. sglang/srt/layers/dp_attention.py +72 -24
  45. sglang/srt/layers/layernorm.py +26 -1
  46. sglang/srt/layers/logits_processor.py +46 -25
  47. sglang/srt/layers/moe/ep_moe/layer.py +172 -206
  48. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
  51. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  52. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  53. sglang/srt/layers/moe/topk.py +88 -34
  54. sglang/srt/layers/multimodal.py +11 -8
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
  56. sglang/srt/layers/quantization/fp8.py +25 -247
  57. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  58. sglang/srt/layers/quantization/modelopt_quant.py +33 -14
  59. sglang/srt/layers/quantization/unquant.py +24 -76
  60. sglang/srt/layers/quantization/utils.py +0 -9
  61. sglang/srt/layers/quantization/w4afp8.py +68 -17
  62. sglang/srt/layers/radix_attention.py +5 -3
  63. sglang/srt/lora/lora_manager.py +133 -169
  64. sglang/srt/lora/lora_registry.py +188 -0
  65. sglang/srt/lora/mem_pool.py +2 -2
  66. sglang/srt/managers/cache_controller.py +62 -13
  67. sglang/srt/managers/io_struct.py +19 -1
  68. sglang/srt/managers/mm_utils.py +154 -35
  69. sglang/srt/managers/multimodal_processor.py +3 -14
  70. sglang/srt/managers/schedule_batch.py +27 -11
  71. sglang/srt/managers/scheduler.py +48 -26
  72. sglang/srt/managers/tokenizer_manager.py +62 -28
  73. sglang/srt/managers/tp_worker.py +5 -4
  74. sglang/srt/mem_cache/allocator.py +67 -7
  75. sglang/srt/mem_cache/hicache_storage.py +17 -1
  76. sglang/srt/mem_cache/hiradix_cache.py +35 -18
  77. sglang/srt/mem_cache/memory_pool_host.py +3 -0
  78. sglang/srt/model_executor/cuda_graph_runner.py +61 -25
  79. sglang/srt/model_executor/forward_batch_info.py +201 -29
  80. sglang/srt/model_executor/model_runner.py +109 -37
  81. sglang/srt/models/deepseek_v2.py +63 -30
  82. sglang/srt/models/glm4_moe.py +1035 -0
  83. sglang/srt/models/glm4_moe_nextn.py +167 -0
  84. sglang/srt/models/interns1.py +328 -0
  85. sglang/srt/models/internvl.py +143 -47
  86. sglang/srt/models/llava.py +9 -5
  87. sglang/srt/models/minicpmo.py +4 -1
  88. sglang/srt/models/mllama4.py +10 -3
  89. sglang/srt/models/qwen2_moe.py +2 -6
  90. sglang/srt/models/qwen3_moe.py +6 -8
  91. sglang/srt/multimodal/processors/base_processor.py +20 -6
  92. sglang/srt/multimodal/processors/clip.py +2 -2
  93. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  94. sglang/srt/multimodal/processors/gemma3.py +2 -2
  95. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  96. sglang/srt/multimodal/processors/internvl.py +21 -8
  97. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  98. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  99. sglang/srt/multimodal/processors/llava.py +4 -4
  100. sglang/srt/multimodal/processors/minicpm.py +2 -3
  101. sglang/srt/multimodal/processors/mlama.py +2 -2
  102. sglang/srt/multimodal/processors/mllama4.py +18 -111
  103. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  104. sglang/srt/multimodal/processors/pixtral.py +2 -2
  105. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  106. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  107. sglang/srt/multimodal/processors/vila.py +3 -1
  108. sglang/srt/reasoning_parser.py +48 -5
  109. sglang/srt/sampling/sampling_batch_info.py +6 -5
  110. sglang/srt/server_args.py +132 -60
  111. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  112. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
  113. sglang/srt/speculative/eagle_utils.py +51 -23
  114. sglang/srt/speculative/eagle_worker.py +59 -44
  115. sglang/srt/two_batch_overlap.py +9 -5
  116. sglang/srt/utils.py +113 -69
  117. sglang/srt/weight_sync/utils.py +119 -0
  118. sglang/test/runners.py +4 -0
  119. sglang/test/test_activation.py +50 -1
  120. sglang/test/test_utils.py +65 -5
  121. sglang/utils.py +19 -0
  122. sglang/version.py +1 -1
  123. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +6 -6
  124. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +127 -114
  125. sglang/srt/debug_utils.py +0 -74
  126. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
  127. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
  128. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/lang/chat_template.py CHANGED
@@ -448,6 +448,19 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="interns1",
+        default_system_prompt="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="granite-3-instruct",
@@ -609,6 +622,14 @@ def match_internvl_chat(model_path: str):
         return "internvl-2-5"
 
 
+@register_chat_template_matching_function
+def match_interns1_chat(model_path: str):
+    if re.search(r"intern-s1", model_path, re.IGNORECASE):
+        return "interns1"
+    if re.search(r"interns1", model_path, re.IGNORECASE):
+        return "interns1"
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
sglang/srt/_custom_ops.py CHANGED
@@ -1,6 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
 import logging
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 import torch
 
@@ -114,6 +114,34 @@ else:
     def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
         return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
 
+    # ROCM custom quick allreduce
+
+    def init_custom_qr(
+        rank: int, world_size: int, qr_max_size: Optional[int] = None
+    ) -> int:
+        return sgl_kernel.allreduce.init_custom_qr(world_size, rank, qr_max_size)
+
+    def qr_get_handle(fa: int) -> torch.Tensor:
+        return sgl_kernel.allreduce.qr_get_handle(fa)
+
+    def qr_open_handles(fa: int, handles: list[torch.Tensor]) -> None:
+        sgl_kernel.allreduce.qr_open_handles(fa, handles)
+
+    def qr_all_reduce(
+        fa: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        quant_level: int,
+        cast_bf2half: bool,
+    ) -> None:
+        sgl_kernel.allreduce.qr_all_reduce(fa, inp, out, quant_level, cast_bf2half)
+
+    def qr_destroy(fa: int) -> None:
+        sgl_kernel.allreduce.qr_destroy(fa)
+
+    def qr_max_size() -> int:
+        return sgl_kernel.allreduce.qr_max_size()
+
 
     def mscclpp_generate_unique_id() -> bytes:
         return sgl_kernel.allreduce.mscclpp_generate_unique_id()
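The hunk above only registers thin wrappers; sgl_kernel does the actual work. Below is a minimal lifecycle sketch of how these wrappers compose, assuming a ROCm build of sgl_kernel and an already-initialized torch.distributed process group. The handle-exchange step and the quant_level semantics are assumptions here; the real orchestration lives in the new quick_all_reduce.py communicator (file 21 in the list above):

```python
import torch
import torch.distributed as dist

from sglang.srt import _custom_ops as ops


def quick_allreduce(inp: torch.Tensor) -> torch.Tensor:
    rank, world_size = dist.get_rank(), dist.get_world_size()

    # Create the communicator; the returned int is an opaque handle ("fa").
    fa = ops.init_custom_qr(rank, world_size)

    # Exchange IPC handles so each rank can map its peers' buffers.
    # (Assumption: the real communicator may exchange handles differently.)
    local_handle = ops.qr_get_handle(fa)
    all_handles = [torch.empty_like(local_handle) for _ in range(world_size)]
    dist.all_gather(all_handles, local_handle)
    ops.qr_open_handles(fa, all_handles)

    # quant_level selects the on-the-wire quantization (0 meaning "none" is an
    # assumption); cast_bf2half asks the kernel to run bf16 inputs as fp16.
    out = torch.empty_like(inp)
    ops.qr_all_reduce(fa, inp, out, quant_level=0, cast_bf2half=False)

    ops.qr_destroy(fa)
    return out
```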
sglang/srt/configs/internvl.py CHANGED
@@ -10,6 +10,7 @@ from transformers import (
     PretrainedConfig,
     PreTrainedTokenizer,
     Qwen2Config,
+    Qwen3Config,
 )
 
 from sglang.utils import logger
@@ -314,6 +315,8 @@ class InternVLChatConfig(PretrainedConfig):
             self.llm_config = InternLM2Config(**llm_config)
         elif llm_config.get("architectures")[0] == "Qwen2ForCausalLM":
             self.llm_config = Qwen2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen3MoeForCausalLM":
+            self.llm_config = Qwen3Config(**llm_config)
         else:
             raise ValueError(
                 "Unsupported architecture: {}".format(
sglang/srt/configs/model_config.py CHANGED
@@ -127,6 +127,9 @@ class ModelConfig:
         ):
             self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
 
+        if is_draft_model and self.hf_config.architectures[0] == "Glm4MoeForCausalLM":
+            self.hf_config.architectures[0] = "Glm4MoeForCausalLMNextN"
+
         if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
             self.hf_config.architectures[0] = "MiMoMTP"
         # Check model type
@@ -475,7 +478,7 @@
 
     def get_hf_eos_token_id(self) -> Optional[Set[int]]:
         eos_ids = getattr(self.hf_config, "eos_token_id", None)
-        if eos_ids:
+        if eos_ids is not None:
             # it can be either int or list of int
             eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
         if eos_ids is None:
@@ -635,6 +638,7 @@ multimodal_model_archs = [
     "Qwen2_5_VLForConditionalGeneration",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
+    "InternS1ForConditionalGeneration",
    "Phi4MMForCausalLM",
     "VILAForConditionalGeneration",
 ]
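The `get_hf_eos_token_id` change is a truthiness bug fix: an `eos_token_id` of `0` is falsy, so the old guard skipped normalization for it. A minimal illustration:

```python
# Hypothetical config whose EOS token id is 0 (falsy as an int).
eos_ids = 0

# Old guard: `if eos_ids:` is False for 0, so the raw int was never
# normalized into a set.
assert not bool(eos_ids)

# New guard: 0 is not None, so normalization runs as intended.
if eos_ids is not None:
    eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
assert eos_ids == {0}
```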
sglang/srt/constrained/base_grammar_backend.py CHANGED
@@ -168,7 +168,10 @@ class BaseGrammarBackend:
 
 
 def create_grammar_backend(
-    server_args: ServerArgs, tokenizer, vocab_size: int
+    server_args: ServerArgs,
+    tokenizer,
+    vocab_size: int,
+    eos_token_ids: Optional[set] = None,
 ) -> Optional[BaseGrammarBackend]:
     if server_args.grammar_backend == "outlines":
         from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
@@ -180,7 +183,12 @@ def create_grammar_backend(
     elif server_args.grammar_backend == "xgrammar":
         from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend
 
-        grammar_backend = XGrammarGrammarBackend(tokenizer, vocab_size=vocab_size)
+        # Convert Set[int] to List[int] if needed
+        eos_list = list(eos_token_ids) if eos_token_ids else None
+
+        grammar_backend = XGrammarGrammarBackend(
+            tokenizer, vocab_size=vocab_size, model_eos_token_ids=eos_list
+        )
     elif server_args.grammar_backend == "llguidance":
         from sglang.srt.constrained.llguidance_backend import GuidanceBackend
 
sglang/srt/constrained/xgrammar_backend.py CHANGED
@@ -150,14 +150,16 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         self,
         tokenizer,
         vocab_size: int,
+        model_eos_token_ids: Optional[List[int]] = None,
     ):
         super().__init__()
 
-        if True:
-            tokenizer_info = TokenizerInfo.from_huggingface(
-                tokenizer, vocab_size=vocab_size
-            )
-            override_stop_tokens = None
+        # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
+        # This ensures consistency between what the model considers EOS and what XGrammar uses
+        tokenizer_info = TokenizerInfo.from_huggingface(
+            tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
+        )
+        override_stop_tokens = None
 
         self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
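Taken together with the base_grammar_backend.py change above, the model's EOS ids now flow from the model config into XGrammar's TokenizerInfo. A hedged end-to-end sketch; `build_backend` is a hypothetical helper, and the real call site (scheduler setup) may differ:

```python
from typing import Optional, Set

from sglang.srt.constrained.base_grammar_backend import create_grammar_backend


def build_backend(server_args, tokenizer, model_config):
    # ModelConfig.get_hf_eos_token_id() returns Optional[Set[int]];
    # create_grammar_backend converts it to a list and hands it to
    # XGrammarGrammarBackend as model_eos_token_ids.
    eos_ids: Optional[Set[int]] = model_config.get_hf_eos_token_id()
    return create_grammar_backend(
        server_args,
        tokenizer,
        vocab_size=model_config.vocab_size,  # assumption: attribute name
        eos_token_ids=eos_ids,
    )
```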
sglang/srt/conversation.py CHANGED
@@ -623,7 +623,7 @@ def generate_chat_conv(
                     real_content += content.text
                 elif content.type == "image_url":
                     # NOTE: works for llava and intervl2_5
-                    if conv.name == "internvl-2-5":
+                    if conv.name in ["internvl-2-5", "interns1"]:
                         real_content = image_token + real_content
                     else:
                         real_content += image_token
@@ -817,6 +817,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="interns1",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
+        roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+        sep_style=SeparatorStyle.MPT,
+        sep="<|im_end|>\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+        image_token="<image>",
+    )
+)
+
 # Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
 register_conv_template(
     Conversation(
@@ -984,8 +997,10 @@ register_conv_template(
 
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
-    if re.search(r"internvl2_5", model_path, re.IGNORECASE):
+    if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
+    if re.search(r"intern.*s1", model_path, re.IGNORECASE):
+        return "interns1"
 
 
 @register_conv_template_matching_function
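Restating the broadened matcher as a self-contained check (the model paths are illustrative examples, not from the diff):

```python
import re


def match_internvl(model_path: str):
    # Same logic as the hunk above, copied here so the asserts can run.
    if re.search(r"internvl", model_path, re.IGNORECASE):
        return "internvl-2-5"
    if re.search(r"intern.*s1", model_path, re.IGNORECASE):
        return "interns1"


# Any "internvl" path now keeps the internvl-2-5 template...
assert match_internvl("OpenGVLab/InternVL2_5-8B") == "internvl-2-5"
# ...while Intern-S1 paths fall through to the new interns1 template.
assert match_internvl("internlm/Intern-S1") == "interns1"
```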
sglang/srt/debug_utils/__init__.py ADDED
(new empty file; no content to show)
sglang/srt/debug_utils/dump_comparator.py ADDED
@@ -0,0 +1,131 @@
+import argparse
+import functools
+import re
+from pathlib import Path
+
+import polars as pl
+import torch
+
+from sglang.srt.debug_utils.dumper import get_truncated_value
+
+
+def main(args):
+    df_target = read_meta(args.target_path)
+    df_target = df_target.sort("rank", "dump_index")
+    df_target = df_target.filter(
+        (pl.col("forward_pass_id") >= args.start_id)
+        & (pl.col("forward_pass_id") <= args.end_id)
+    )
+    assert all(
+        c in df_target.columns
+        for c in ["rank", "forward_pass_id", "dump_index", "name"]
+    )
+
+    df_baseline = read_meta(args.baseline_path)
+    print("df_target", df_target)
+    print("df_baseline", df_baseline)
+
+    for row in df_target.iter_rows(named=True):
+        rows_baseline = df_baseline.filter(
+            (
+                pl.col("forward_pass_id")
+                == row["forward_pass_id"] - args.start_id + args.baseline_start_id
+            )
+            & functools.reduce(
+                lambda a, b: a & b,
+                [
+                    pl.col(col) == row[col]
+                    for col in row.keys()
+                    if col not in ["forward_pass_id", "dump_index", "filename"]
+                ],
+            )
+        )
+        assert len(rows_baseline) == 1, f"{rows_baseline=}"
+        row_baseline = rows_baseline.to_dicts()[0]
+
+        path_baseline = Path(args.baseline_path) / row_baseline["filename"]
+        path_target = Path(args.target_path) / row["filename"]
+        print(f"Check: target={str(path_target)} baseline={str(path_baseline)}")
+        check_tensor_pair(path_baseline=path_baseline, path_target=path_target)
+        print()
+
+
+def read_meta(directory):
+    directory = Path(directory)
+    assert directory.is_dir(), f"{directory=} should be a directory"
+
+    rows = []
+    for p in directory.glob("*.pt"):
+        full_kwargs = {}
+        for kv in p.stem.split("___"):
+            k, v = kv.split("=")
+            full_kwargs[k] = v
+        rows.append(
+            {
+                "filename": str(p.name),
+                **full_kwargs,
+            }
+        )
+
+    df = pl.DataFrame(rows)
+    df = df.with_columns(
+        pl.col("forward_pass_id").cast(int),
+        pl.col("rank").cast(int),
+    )
+    return df
+
+
+def check_tensor_pair(path_baseline, path_target):
+    x_baseline = torch.load(path_baseline, weights_only=True)
+    x_target = torch.load(path_target, weights_only=True)
+
+    print(
+        f"[shape] {x_baseline.shape} vs {x_target.shape}\t"
+        f"[dtype] {x_baseline.dtype} vs {x_target.dtype}"
+    )
+
+    if x_baseline.shape != x_target.shape:
+        print(f"❌ Shape mismatch")
+        return
+
+    raw_abs_diff = (x_target - x_baseline).abs()
+
+    max_abs_diff = raw_abs_diff.max().item()
+    mean_abs_diff = raw_abs_diff.mean().item()
+    rel_diff = _calc_rel_diff(x_target, x_baseline)
+
+    needs_print = max_abs_diff > 1e-3
+
+    print(
+        "\t".join(
+            f"{'❌' if value > 1e-3 else '✅'} {name}={value}"
+            for name, value in [
+                ("rel_diff", rel_diff),
+                ("max_abs_diff", max_abs_diff),
+                ("mean_abs_diff", mean_abs_diff),
+            ]
+        )
+    )
+
+    if needs_print:
+        print(f"x_baseline(sample)={get_truncated_value(x_baseline)}")
+        print(f"x_target(sample)={get_truncated_value(x_target)}")
+
+
+# Copied from DeepGEMM
+def _calc_rel_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--baseline-path", type=str)
+    parser.add_argument("--target-path", type=str)
+    parser.add_argument("--start-id", type=int, default=0)
+    parser.add_argument("--end-id", type=int, default=1000000)
+    parser.add_argument("--baseline-start-id", type=int, default=0)
+    args = parser.parse_args()
+    main(args)
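For reference, `_calc_rel_diff` above computes a symmetric, cosine-style divergence (the code credits DeepGEMM). In math form:

```latex
% rel_diff as computed by _calc_rel_diff, in double precision:
\mathrm{rel\_diff}(x, y) = 1 - \frac{2 \sum_i x_i y_i}{\sum_i \left( x_i^{2} + y_i^{2} \right)}
% Equals 0 when x = y exactly, and grows as the tensors diverge
% (reaching 2 in the anti-correlated extreme y = -x).
```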
sglang/srt/debug_utils/dumper.py ADDED
@@ -0,0 +1,108 @@
+import os
+import time
+from pathlib import Path
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+
+class _Dumper:
+    """Utility to dump tensors, which can be useful when comparison checking models.
+
+    Example usage:
+    dumper.on_forward_pass_start()
+    dumper.dump("layer_start__hidden_states", hidden_states, layer_id=self.layer_id)
+
+    Import from non-SGLang system:
+    ```
+    import sys
+    sys.path.append("/YOUR_PATH/sglang/python/sglang/srt/debug_utils")
+    from dumper import dumper
+    ```
+
+    Related: `sglang.srt.debug_utils.dump_comparator` for dump comparison
+    """
+
+    def __init__(self):
+        # Do not import `sglang` to make this file standalone
+        self._enable = bool(int(os.environ.get("SGLANG_DUMPER_ENABLE", "1")))
+        self._base_dir = Path(os.environ.get("SGLANG_DUMPER_DIR", "/tmp"))
+        self._enable_write_file = bool(
+            int(os.environ.get("SGLANG_DUMPER_WRITE_FILE", "1"))
+        )
+        self._partial_name: Optional[str] = None
+        self._dump_index = 0
+        self._forward_pass_id = 0
+
+    def on_forward_pass_start(self):
+        self._forward_pass_id += 1
+        print(
+            f"[Dumper] [{time.time()}] on_forward_pass_start id={self._forward_pass_id}"
+        )
+
+    def dump(self, name, value, **kwargs):
+        if not self._enable:
+            return
+
+        assert (
+            self._forward_pass_id >= 1
+        ), "Do you forget to call `dumper.on_forward_pass_start()`?"
+        self._dump_index += 1
+
+        if self._partial_name is None:
+            self._partial_name = _get_partial_name()
+
+        rank = dist.get_rank()
+        full_kwargs = dict(
+            forward_pass_id=self._forward_pass_id,
+            rank=rank,
+            name=name,
+            dump_index=self._dump_index,
+            **kwargs,
+        )
+        full_filename = "___".join(f"{k}={v}" for k, v in full_kwargs.items()) + ".pt"
+        path = self._base_dir / f"sglang_dump_{self._partial_name}" / full_filename
+
+        sample_value = get_truncated_value(value)
+
+        print(
+            f"[Dumper] [{rank}, {time.time()}] {path} "
+            f"type={type(value)} "
+            f"shape={value.shape if isinstance(value, torch.Tensor) else None} "
+            f"dtype={value.dtype if isinstance(value, torch.Tensor) else None} "
+            f"sample_value={sample_value}"
+        )
+
+        if self._enable_write_file:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            torch.save(value, str(path))
+
+
+def _get_partial_name():
+    rank = dist.get_rank()
+    object_list = [str(time.time()) if rank == 0 else None]
+    dist.broadcast_object_list(object_list, device="cuda")
+    return object_list[0]
+
+
+def get_truncated_value(value):
+    if value is None:
+        return None
+
+    if isinstance(value, tuple):
+        return [get_truncated_value(x) for x in value]
+
+    if not isinstance(value, torch.Tensor):
+        return None
+
+    if value.numel() < 200:
+        return value
+
+    slices = [
+        slice(0, 5) if dim_size > 200 else slice(None) for dim_size in value.shape
+    ]
+    return value[tuple(slices)]
+
+
+dumper = _Dumper()
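A hedged sketch of the on-disk layout `dump()` produces (the timestamp placeholder and the `layer_id` kwarg are illustrative). This `key=value___` naming scheme is exactly what `dump_comparator.read_meta` parses back into a DataFrame:

```python
# dumper.dump("layer_start__hidden_states", hidden_states, layer_id=3) on
# rank 0 of the first forward pass writes roughly:
path = (
    "/tmp/sglang_dump_<timestamp>/"  # SGLANG_DUMPER_DIR + broadcast suffix
    "forward_pass_id=1___rank=0"
    "___name=layer_start__hidden_states"
    "___dump_index=1___layer_id=3.pt"
)
# Load a dump by hand the same way dump_comparator does:
#   torch.load(path, weights_only=True)
```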
sglang/srt/debug_utils/text_comparator.py ADDED
@@ -0,0 +1,172 @@
+import argparse
+import json
+from pathlib import Path
+
+import polars as pl
+
+_DESCRIPTION = """Compare and find differences to benchmark outputs.
+
+Supported inputs:
+* The samples jsonl from `lm_eval --log_samples --output_path FOLDER_NAME`
+* The output from `gsm8k/bench_sglang.py --raw-result-file FILE_NAME` (or mmlu)
+"""
+
+
+def main(args):
+    df_input = _transform_df_input(_compute_df_raw(args))
+    assert all(
+        c in df_input.columns
+        for c in ["category", "trial_index", "prompt_id", "prompt", "output", "correct"]
+    )
+
+    df_meta = _compute_df_meta(df_input)
+
+    df_correctness_per_trial = df_input.group_by(
+        "category", "trial_index", maintain_order=True
+    ).agg(pl.col("correct").mean())
+    df_correctness_delta = (
+        df_meta.group_by("correctness_delta").len().sort("correctness_delta")
+    )
+    df_good_to_bad = df_meta.filter(pl.col("correctness_delta") < 0)
+    df_bad_to_good = df_meta.filter(pl.col("correctness_delta") > 0)
+
+    print(f"Dump output to {args.output_path}")
+    Path(args.output_path).write_text(
+        json.dumps(
+            dict(
+                df_meta=df_meta.to_dicts(),
+                df_good_to_bad=df_good_to_bad.to_dicts(),
+                df_bad_to_good=df_bad_to_good.to_dicts(),
+            )
+        )
+    )
+
+    if not args.disable_print_details:
+        with pl.Config(
+            fmt_str_lengths=10000,
+            tbl_cols=-1,
+            tbl_rows=-1,
+            tbl_width_chars=-1,
+            tbl_formatting="UTF8_FULL",
+        ):
+            print("====== Correctness per trial ======")
+            print(df_correctness_per_trial)
+
+            print(
+                "====== Correctness Delta (-1.0 means all-right becomes all-wrong) ======"
+            )
+            print(df_correctness_delta)
+
+            for name, df in [
+                ("Good->Bad", df_good_to_bad),
+                ("Bad->Good", df_bad_to_good),
+            ]:
+                print(f"====== Concrete Examples: {name} ======")
+                print(df)
+
+
+def _compute_df_raw(args):
+    return pl.concat(
+        [
+            _read_df_raw(p, category=category, trial_index=i)
+            for category, paths in [
+                ("baseline", args.baseline_path),
+                ("target", args.target_path),
+            ]
+            for i, p in enumerate(paths)
+        ]
+    )
+
+
+def _read_df_raw(path: str, category: str, trial_index: int):
+    return pl.read_ndjson(path).with_columns(
+        category=pl.lit(category), trial_index=trial_index
+    )
+
+
+def _transform_df_input(df: pl.DataFrame):
+    if "doc_id" in df.columns:
+        print("Transform mode: lm_eval")
+
+        filter_names = df["filter"].unique(maintain_order=True).to_list()
+        if len(filter_names) > 1:
+            filter_name = filter_names[0]
+            print(f"Choose {filter_name=} among {filter_names}")
+            df = df.filter(pl.col("filter") == filter_name)
+
+        df = df.select(
+            pl.col("category"),
+            pl.col("trial_index"),
+            prompt_id=pl.col("doc_id"),
+            prompt=pl.col("arguments").struct.field("gen_args_0").struct.field("arg_0"),
+            output=pl.col("resps").list.get(0).list.get(0),
+            correct=pl.col("exact_match").cast(bool),
+        )
+
+        return df
+    elif "prompt_id" in df.columns:
+        print("Transform mode: SGLang bench")
+        return df
+    else:
+        raise Exception(f"Unknown data: {df.columns}")
+
+
+def _compute_df_meta(df_input: pl.DataFrame):
+    df_input = df_input.sort("prompt_id", "category", "trial_index")
+    df_meta = pl.DataFrame(
+        [
+            _handle_one_prompt(df_one_prompt)
+            for df_one_prompt in df_input.partition_by("prompt_id", maintain_order=True)
+        ]
+    )
+    df_meta = df_meta.with_columns(
+        correctness_delta=pl.col("correctness_target") - pl.col("correctness_baseline"),
+    )
+    df_meta = df_meta.sort("correctness_delta", "output_same_prefix_len")
+    return df_meta
+
+
+def _handle_one_prompt(df_one_prompt: pl.DataFrame):
+    assert len(set(df_one_prompt["prompt"])) == 1
+
+    df_baseline = df_one_prompt.filter(pl.col("category") == "baseline")
+    df_target = df_one_prompt.filter(pl.col("category") == "target")
+
+    outputs_baseline = df_baseline["output"].to_list()
+    outputs_target = df_target["output"].to_list()
+
+    output_same_prefix_len = max(
+        _compute_str_prefix_len(output_baseline, output_target)
+        for output_baseline in outputs_baseline
+        for output_target in outputs_target
+    )
+
+    return dict(
+        prompt_id=df_one_prompt[0, "prompt_id"],
+        correctness_baseline=df_baseline["correct"].mean(),
+        correctness_target=df_target["correct"].mean(),
+        output_same_prefix_len=output_same_prefix_len,
+        prompt=df_one_prompt[0, "prompt"],
+        outputs_baseline=outputs_baseline,
+        outputs_target=outputs_target,
+    )
+
+
+def _compute_str_prefix_len(a: str, b: str) -> int:
+    min_len = min(len(a), len(b))
+    for i in range(min_len):
+        if a[i] != b[i]:
+            return i
+    return min_len
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=_DESCRIPTION)
+    parser.add_argument("--baseline-path", type=str, nargs="+")
+    parser.add_argument("--target-path", type=str, nargs="+")
+    parser.add_argument(
+        "--output-path", type=str, default="/tmp/text_comparator_output.json"
+    )
+    parser.add_argument("--disable-print-details", action="store_true")
+    args = parser.parse_args()
+    main(args)
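To make the expected input concrete, here is a minimal sketch of the "SGLang bench" ndjson shape that `_transform_df_input` passes through unchanged. The `category` and `trial_index` columns are attached by `_read_df_raw`, so the raw file only needs the remaining columns; file names and rows are illustrative:

```python
import json

rows = [
    {"prompt_id": 0, "prompt": "1+1=", "output": "2", "correct": True},
    {"prompt_id": 1, "prompt": "2+2=", "output": "5", "correct": False},
]
with open("/tmp/baseline.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

# Then, e.g. (one or more files per side):
#   python -m sglang.srt.debug_utils.text_comparator \
#       --baseline-path /tmp/baseline.jsonl --target-path /tmp/target.jsonl
```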