sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +149 -34
- sglang/bench_serving.py +18 -3
- sglang/compile_deep_gemm.py +13 -7
- sglang/srt/batch_invariant_ops/__init__.py +2 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
- sglang/srt/checkpoint_engine/__init__.py +9 -0
- sglang/srt/checkpoint_engine/update.py +317 -0
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/deepseek_ocr.py +542 -10
- sglang/srt/configs/deepseekvl2.py +95 -194
- sglang/srt/configs/kimi_linear.py +160 -0
- sglang/srt/configs/mamba_utils.py +66 -0
- sglang/srt/configs/model_config.py +25 -2
- sglang/srt/constants.py +7 -0
- sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
- sglang/srt/disaggregation/decode.py +34 -6
- sglang/srt/disaggregation/nixl/conn.py +2 -2
- sglang/srt/disaggregation/prefill.py +25 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
- sglang/srt/distributed/parallel_state.py +9 -5
- sglang/srt/entrypoints/engine.py +13 -5
- sglang/srt/entrypoints/http_server.py +22 -3
- sglang/srt/entrypoints/openai/protocol.py +7 -1
- sglang/srt/entrypoints/openai/serving_chat.py +42 -0
- sglang/srt/entrypoints/openai/serving_completions.py +10 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/environ.py +7 -0
- sglang/srt/eplb/expert_distribution.py +34 -1
- sglang/srt/eplb/expert_location.py +106 -36
- sglang/srt/grpc/compile_proto.py +3 -0
- sglang/srt/layers/attention/ascend_backend.py +233 -5
- sglang/srt/layers/attention/attention_registry.py +3 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
- sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
- sglang/srt/layers/attention/fla/kda.py +1359 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
- sglang/srt/layers/attention/flashattention_backend.py +7 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
- sglang/srt/layers/attention/mamba/mamba.py +20 -11
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
- sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
- sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
- sglang/srt/layers/attention/nsa/transform_index.py +1 -1
- sglang/srt/layers/attention/nsa_backend.py +157 -23
- sglang/srt/layers/attention/triton_backend.py +4 -1
- sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
- sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
- sglang/srt/layers/communicator.py +23 -1
- sglang/srt/layers/layernorm.py +16 -2
- sglang/srt/layers/logits_processor.py +4 -20
- sglang/srt/layers/moe/ep_moe/layer.py +0 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
- sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
- sglang/srt/layers/moe/topk.py +31 -6
- sglang/srt/layers/pooler.py +21 -2
- sglang/srt/layers/quantization/__init__.py +9 -78
- sglang/srt/layers/quantization/auto_round.py +394 -0
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +168 -11
- sglang/srt/layers/rotary_embedding.py +117 -45
- sglang/srt/lora/lora_registry.py +9 -0
- sglang/srt/managers/async_mm_data_processor.py +122 -0
- sglang/srt/managers/data_parallel_controller.py +30 -3
- sglang/srt/managers/detokenizer_manager.py +3 -0
- sglang/srt/managers/io_struct.py +26 -4
- sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
- sglang/srt/managers/schedule_batch.py +74 -15
- sglang/srt/managers/scheduler.py +164 -129
- sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
- sglang/srt/managers/scheduler_pp_mixin.py +7 -2
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
- sglang/srt/managers/session_controller.py +6 -5
- sglang/srt/managers/tokenizer_manager.py +154 -59
- sglang/srt/managers/tp_worker.py +24 -1
- sglang/srt/mem_cache/base_prefix_cache.py +23 -4
- sglang/srt/mem_cache/common.py +1 -0
- sglang/srt/mem_cache/memory_pool.py +171 -57
- sglang/srt/mem_cache/memory_pool_host.py +12 -5
- sglang/srt/mem_cache/radix_cache.py +4 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
- sglang/srt/metrics/collector.py +46 -3
- sglang/srt/model_executor/cuda_graph_runner.py +15 -3
- sglang/srt/model_executor/forward_batch_info.py +11 -11
- sglang/srt/model_executor/model_runner.py +76 -21
- sglang/srt/model_executor/npu_graph_runner.py +7 -3
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/bailing_moe.py +9 -2
- sglang/srt/models/deepseek_nextn.py +11 -2
- sglang/srt/models/deepseek_v2.py +149 -34
- sglang/srt/models/glm4.py +391 -77
- sglang/srt/models/glm4v.py +196 -55
- sglang/srt/models/glm4v_moe.py +0 -1
- sglang/srt/models/gpt_oss.py +1 -10
- sglang/srt/models/kimi_linear.py +678 -0
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/llama_eagle3.py +11 -1
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/minimax_m2.py +1 -1
- sglang/srt/models/qwen2.py +1 -1
- sglang/srt/models/qwen2_moe.py +30 -15
- sglang/srt/models/qwen3.py +1 -1
- sglang/srt/models/qwen3_moe.py +16 -8
- sglang/srt/models/qwen3_next.py +7 -0
- sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
- sglang/srt/multiplex/multiplexing_mixin.py +209 -0
- sglang/srt/multiplex/pdmux_context.py +164 -0
- sglang/srt/parser/conversation.py +7 -1
- sglang/srt/sampling/custom_logit_processor.py +67 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
- sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
- sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
- sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
- sglang/srt/server_args.py +103 -22
- sglang/srt/single_batch_overlap.py +4 -1
- sglang/srt/speculative/draft_utils.py +16 -0
- sglang/srt/speculative/eagle_info.py +42 -36
- sglang/srt/speculative/eagle_info_v2.py +68 -25
- sglang/srt/speculative/eagle_utils.py +261 -16
- sglang/srt/speculative/eagle_worker.py +11 -3
- sglang/srt/speculative/eagle_worker_v2.py +15 -9
- sglang/srt/speculative/spec_info.py +305 -31
- sglang/srt/speculative/spec_utils.py +44 -8
- sglang/srt/tracing/trace.py +121 -12
- sglang/srt/utils/common.py +55 -32
- sglang/srt/utils/hf_transformers_utils.py +38 -16
- sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
- sglang/test/kits/radix_cache_server_kit.py +50 -0
- sglang/test/runners.py +31 -7
- sglang/test/simple_eval_common.py +5 -3
- sglang/test/simple_eval_humaneval.py +1 -0
- sglang/test/simple_eval_math.py +1 -0
- sglang/test/simple_eval_mmlu.py +1 -0
- sglang/test/simple_eval_mmmu_vlm.py +1 -0
- sglang/test/test_utils.py +7 -1
- sglang/version.py +1 -1
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
- /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/checkpoint_engine/update.py
ADDED
@@ -0,0 +1,317 @@
+"""
+Usage:
+1) Launch the server with wait-for-initial-weights option in one terminal:
+python -m sglang.launch_server --model-path /workspace/Qwen/Qwen3-4B/ --tensor-parallel-size 2 --port 19730 --load-format dummy --checkpoint-engine-wait-weights-before-ready --mem-fraction-static 0.7
+
+2) Torchrun this script in another terminal:
+torchrun --nproc-per-node 2 update.py --update-method broadcast --checkpoint-path /workspace/Qwen/Qwen3-4B/ --inference-parallel-size 2
+
+Or use the integrated entry point:
+python -m sglang.srt.checkpoint_engine.update --update-method broadcast --checkpoint-path /workspace/Qwen/Qwen3-4B/ --inference-parallel-size 2
+"""
+
+import argparse
+import json
+import os
+import pickle
+import subprocess
+import sys
+import time
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import contextmanager
+from typing import Literal
+
+import httpx
+import torch
+import torch.distributed as dist
+from safetensors import safe_open
+
+try:
+    from checkpoint_engine.ps import ParameterServer
+    from loguru import logger
+except ImportError:
+    # Fallback for when checkpoint_engine is not available
+    ParameterServer = None
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def timer(msg: str):
+    start = time.perf_counter()
+    yield
+    end = time.perf_counter()
+    logger.info(f"{msg} duration: {end - start:.2f} seconds")
+
+
+def check_sglang_ready(
+    endpoint: str, inference_parallel_size: int, uds: str | None = None
+):
+    rank = int(os.getenv("RANK", 0))
+    if rank != rank // inference_parallel_size * inference_parallel_size:
+        return
+    retry_num = 0
+    transport = None
+    if uds is not None:
+        transport = httpx.HTTPTransport(uds=uds)
+    with httpx.Client(transport=transport) as client:
+        while True:
+            try:
+                response = client.get(f"{endpoint}/ping", timeout=10)
+                response.raise_for_status()
+                break
+            except (httpx.ConnectError, httpx.HTTPStatusError) as e:
+                if retry_num % 10 == 0:
+                    logger.warning(
+                        f"fail to check sglang ready, retry {retry_num} times, error: {e}"
+                    )
+                retry_num += 1
+                time.sleep(0.1)
+
+
+def split_checkpoint_files(
+    checkpoint_path: str, rank: int, world_size: int
+) -> list[str]:
+    checkpoint_files = [
+        os.path.join(checkpoint_path, f)
+        for f in filter(
+            lambda x: x.endswith(".safetensors"), os.listdir(checkpoint_path)
+        )
+    ]
+    files_per_rank = (len(checkpoint_files) + world_size - 1) // world_size
+    return checkpoint_files[rank * files_per_rank : (rank + 1) * files_per_rank]
+
+
+def split_tensors(
+    checkpoint_path: str, rank: int, world_size: int
+) -> dict[str, torch.Tensor]:
+    index_fn = os.path.join(checkpoint_path, "model.safetensors.index.json")
+    with open(index_fn) as f:
+        weight_map: dict[str, str] = json.load(f)["weight_map"]
+    weights_per_rank = (len(weight_map) + world_size - 1) // world_size
+    fn_tensors: dict[str, list[str]] = defaultdict(list)
+    weight_keys = list(weight_map.items())
+    for name, file in weight_keys[
+        rank * weights_per_rank : (rank + 1) * weights_per_rank
+    ]:
+        fn_tensors[file].append(name)
+    named_tensors = {}
+    for file, names in fn_tensors.items():
+        with safe_open(os.path.join(checkpoint_path, file), framework="pt") as f:
+            for name in names:
+                named_tensors[name] = f.get_tensor(name)
+    return named_tensors
+
+
+def req_inference(
+    endpoint: str,
+    inference_parallel_size: int,
+    timeout: float = 300.0,
+    uds: str | None = None,
+    weight_version: str | None = None,
+) -> Callable[[list[tuple[str, str]]], None]:
+    rank = int(os.getenv("RANK", 0))
+    src = rank // inference_parallel_size * inference_parallel_size
+
+    def req_func(socket_paths: list[tuple[str, str]]):
+        if rank == src:
+            with httpx.Client(transport=httpx.HTTPTransport(uds=uds)) as client:
+                resp = client.post(
+                    f"{endpoint}/update_weights_from_ipc",
+                    json={
+                        "zmq_handles": dict(
+                            socket_paths[src : src + inference_parallel_size]
+                        ),
+                        "flush_cache": True,
+                        "weight_version": weight_version,
+                    },
+                    timeout=timeout,
+                )
+                resp.raise_for_status()
+
+    return req_func
+
+
+def update_weights(
+    ps,
+    checkpoint_name: str,
+    checkpoint_files: list[str],
+    named_tensors: dict[str, torch.Tensor],
+    req_func: Callable[[list[tuple[str, str]]], None],
+    inference_parallel_size: int,
+    endpoint: str,
+    save_metas_file: str | None = None,
+    update_method: Literal["broadcast", "p2p", "all"] = "broadcast",
+    uds: str | None = None,
+):
+    ps.register_checkpoint(
+        checkpoint_name, files=checkpoint_files, named_tensors=named_tensors
+    )
+    ps.init_process_group()
+    check_sglang_ready(endpoint, inference_parallel_size, uds)
+    dist.barrier()
+    with timer("Gather metas"):
+        ps.gather_metas(checkpoint_name)
+    if save_metas_file and int(os.getenv("RANK")) == 0:
+        with open(save_metas_file, "wb") as f:
+            pickle.dump(ps.get_metas(), f)
+
+    if update_method == "broadcast" or update_method == "all":
+        with timer("Update weights without setting ranks"):
+            ps.update(checkpoint_name, req_func)
+
+    if update_method == "p2p" or update_method == "all":
+        if update_method:
+            # sleep 2s to wait destroy process group
+            time.sleep(2)
+        with timer("Update weights with setting ranks"):
+            ps.update(
+                checkpoint_name, req_func, ranks=list(range(inference_parallel_size))
+            )
+
+
+def join(
+    ps: ParameterServer,
+    checkpoint_name: str,
+    load_metas_file: str,
+    req_func: Callable[[list[tuple[str, str]]], None],
+    inference_parallel_size: int,
+    endpoint: str,
+    uds: str | None = None,
+):
+    assert load_metas_file, "load_metas_file is required"
+    with open(load_metas_file, "rb") as f:
+        metas = pickle.load(f)
+    ps.init_process_group()
+    check_sglang_ready(endpoint, inference_parallel_size, uds)
+    dist.barrier()
+    with timer("Gather metas before join"):
+        ps.gather_metas(checkpoint_name)
+    ps.load_metas(metas)
+    with timer(
+        f"Update weights with setting ranks as range(0, {inference_parallel_size}) by using p2p"
+    ):
+        ps.update(checkpoint_name, req_func, ranks=list(range(inference_parallel_size)))
+
+
+def run_with_torchrun():
+    """Run the update script with torchrun automatically."""
+    # Parse inference_parallel_size from command line arguments to determine nproc-per-node
+    inference_parallel_size = 8  # default
+    args = sys.argv[1:]  # Skip the script name
+
+    # Look for --inference-parallel-size in arguments
+    for i, arg in enumerate(args):
+        if arg == "--inference-parallel-size" and i + 1 < len(args):
+            try:
+                inference_parallel_size = int(args[i + 1])
+            except ValueError:
+                pass
+            break
+        elif arg.startswith("--inference-parallel-size="):
+            try:
+                inference_parallel_size = int(arg.split("=", 1)[1])
+            except ValueError:
+                pass
+            break
+
+    # Build torchrun command
+    cmd = ["torchrun", f"--nproc-per-node={inference_parallel_size}", __file__] + args
+
+    print(f"Running: {' '.join(cmd)}", file=sys.stderr)
+
+    # Execute torchrun with the original script
+    try:
+        result = subprocess.run(cmd, check=False)
+        sys.exit(result.returncode)
+    except FileNotFoundError:
+        print(
+            "Error: torchrun command not found. Please ensure PyTorch is installed.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    except KeyboardInterrupt:
+        print("\nInterrupted by user", file=sys.stderr)
+        sys.exit(130)
+
+
+def main():
+    # Check if we're running under torchrun or need to invoke it
+    if os.getenv("RANK") is None:
+        # Not running under torchrun, so invoke it
+        run_with_torchrun()
+        return
+
+    # Running under torchrun, proceed with normal execution
+    parser = argparse.ArgumentParser(description="Update weights example")
+    parser.add_argument("--checkpoint-path", type=str, default=None)
+    parser.add_argument("--save-metas-file", type=str, default=None)
+    parser.add_argument("--load-metas-file", type=str, default=None)
+    parser.add_argument("--sleep-time", type=int, default=0)
+    parser.add_argument("--endpoint", type=str, default="http://localhost:19730")
+    parser.add_argument("--inference-parallel-size", type=int, default=8)
+    parser.add_argument("--checkpoint-name", type=str, default="my-checkpoint-iter-0")
+    parser.add_argument("--update-method", type=str, default="broadcast")
+    parser.add_argument("--uds", type=str, default=None)
+    parser.add_argument("--weight-version", type=str, default=None)
+    args = parser.parse_args()
+
+    # Get rank and world_size from environment (set by torchrun)
+    rank = int(os.getenv("RANK", 0))
+    world_size = int(os.getenv("WORLD_SIZE", 1))
+
+    req_func = req_inference(
+        args.endpoint,
+        args.inference_parallel_size,
+        uds=args.uds,
+        weight_version=args.weight_version,
+    )
+
+    if ParameterServer is None:
+        print("Error: checkpoint_engine package not available", file=sys.stderr)
+        sys.exit(1)
+
+    ps = ParameterServer(auto_pg=True)
+    ps._p2p_store = None
+    if args.load_metas_file:
+        join(
+            ps,
+            args.checkpoint_name,
+            args.load_metas_file,
+            req_func,
+            args.inference_parallel_size,
+            args.endpoint,
+            args.uds,
+        )
+    else:
+        if args.checkpoint_path and os.path.exists(
+            os.path.join(args.checkpoint_path, "model.safetensors.index.json")
+        ):
+            named_tensors = split_tensors(args.checkpoint_path, rank, world_size)
+            checkpoint_files = []
+        else:
+            checkpoint_files = (
+                split_checkpoint_files(args.checkpoint_path, rank, world_size)
+                if args.checkpoint_path
+                else []
+            )
+            named_tensors = {}
+        update_weights(
+            ps,
+            args.checkpoint_name,
+            checkpoint_files,
+            named_tensors,
+            req_func,
+            args.inference_parallel_size,
+            args.endpoint,
+            args.save_metas_file,
+            args.update_method,
+            args.uds,
+        )
+    time.sleep(args.sleep_time)
+
+
+if __name__ == "__main__":
+    main()
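The script's interaction with the server reduces to two HTTP calls: it polls `GET {endpoint}/ping` until the server is ready, then sends `POST {endpoint}/update_weights_from_ipc` with the ZMQ socket handles produced by checkpoint-engine's `ParameterServer`. Note also that `split_checkpoint_files` and `split_tensors` shard work by ceiling division, so the last rank may receive fewer items: with 5 `.safetensors` files and `world_size=2`, `files_per_rank = (5 + 1) // 2 = 3`, rank 0 takes files[0:3] and rank 1 takes the remaining 2. Below is a minimal sketch of the client side, assuming a server on the script's default endpoint; the `zmq_handles` entry is a hypothetical placeholder, since in the real flow those handles come from checkpoint-engine:

```python
import httpx

endpoint = "http://localhost:19730"  # the script's --endpoint default

with httpx.Client() as client:
    # Block until the server answers /ping (update.py retries this every 0.1 s).
    client.get(f"{endpoint}/ping", timeout=10).raise_for_status()

    # Ask the server to pull new weights over IPC. The zmq_handles mapping is
    # normally supplied by checkpoint_engine's ParameterServer; the key and
    # value below are placeholders for illustration only.
    resp = client.post(
        f"{endpoint}/update_weights_from_ipc",
        json={
            "zmq_handles": {"rank-0": "ipc:///tmp/ckpt-handle-0"},  # hypothetical
            "flush_cache": True,
            "weight_version": "my-checkpoint-iter-0",
        },
        timeout=300.0,
    )
    resp.raise_for_status()
```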
sglang/srt/configs/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from sglang.srt.configs.dots_vlm import DotsVLMConfig
 from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.falcon_h1 import FalconH1Config
 from sglang.srt.configs.janus_pro import MultiModalityConfig
+from sglang.srt.configs.kimi_linear import KimiLinearConfig
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
 from sglang.srt.configs.longcat_flash import LongcatFlashConfig
@@ -31,6 +32,7 @@ __all__ = [
     "Step3TextConfig",
     "Step3VisionEncoderConfig",
     "Olmo3Config",
+    "KimiLinearConfig",
     "Qwen3NextConfig",
     "DotsVLMConfig",
     "DotsOCRConfig",