sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -8
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +157 -21
- sglang/bench_serving.py +137 -59
- sglang/compile_deep_gemm.py +5 -5
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +40 -28
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +49 -44
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +129 -135
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +238 -122
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +10 -19
- sglang/srt/disaggregation/prefill.py +132 -47
- sglang/srt/disaggregation/utils.py +123 -6
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +44 -9
- sglang/srt/entrypoints/http_server.py +23 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +64 -18
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +6 -4
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +61 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
- sglang/srt/layers/moe/ep_moe/layer.py +105 -51
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +67 -10
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +8 -3
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +77 -74
- sglang/srt/layers/quantization/fp8.py +92 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +20 -7
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +2 -4
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +19 -4
- sglang/srt/managers/mm_utils.py +294 -140
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +122 -42
- sglang/srt/managers/schedule_policy.py +1 -5
- sglang/srt/managers/scheduler.py +205 -138
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +232 -58
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +76 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +314 -39
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +29 -19
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +5 -1
- sglang/srt/model_executor/model_runner.py +163 -68
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +308 -351
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +15 -8
- sglang/srt/models/llava.py +258 -7
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +58 -20
- sglang/srt/openai_api/protocol.py +6 -8
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +162 -22
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +138 -7
- sglang/srt/speculative/eagle_worker.py +69 -21
- sglang/srt/utils.py +74 -17
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +55 -14
- sglang/utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/test/send_one.py
CHANGED
@@ -27,6 +27,7 @@ class BenchArgs:
         "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:"
     )
     image: bool = False
+    many_images: bool = False
     stream: bool = False
 
     @staticmethod
@@ -48,6 +49,7 @@ class BenchArgs:
         parser.add_argument("--return-logprob", action="store_true")
         parser.add_argument("--prompt", type=str, default=BenchArgs.prompt)
         parser.add_argument("--image", action="store_true")
+        parser.add_argument("--many-images", action="store_true")
        parser.add_argument("--stream", action="store_true")
 
     @classmethod
@@ -62,6 +64,17 @@ def send_one_prompt(args):
            "Human: Describe this image in a very short sentence.\n\nAssistant:"
        )
        image_data = "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png"
+    elif args.many_images:
+        args.prompt = (
+            "Human: I have one reference image and many images."
+            "Describe their relationship in a very short sentence.\n\nAssistant:"
+        )
+        image_data = [
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+        ]
     else:
         image_data = None
 
@@ -74,9 +87,6 @@ def send_one_prompt(args):
            "Write in a format of json.\nAssistant:"
        )
        json_schema = "$$ANY$$"
-        json_schema = (
-            '{"type": "object", "properties": {"population": {"type": "integer"}}}'
-        )
     else:
         json_schema = None
 
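Note on the new flag: with --many-images, send_one sends a single prompt together with a list of image URLs. As a hedged sketch only (the request construction lives in parts of send_one.py not shown in this diff, and the /generate payload fields below are assumptions based on sglang's native HTTP API), the resulting request looks roughly like:

import requests

payload = {
    # Prompt and image list mirror the values added in the diff above.
    "text": "Human: I have one reference image and many images."
    "Describe their relationship in a very short sentence.\n\nAssistant:",
    "image_data": [
        "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
    ]
    * 4,
    "sampling_params": {"temperature": 0, "max_new_tokens": 64},
}
# Assumes a locally running sglang server on the default port.
resp = requests.post("http://127.0.0.1:30000/generate", json=payload)
print(resp.json())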
sglang/test/simple_eval_common.py
CHANGED
@@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase):
                    max_tokens=self.max_tokens,
                )
                return response.choices[0].message.content
-            # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are
+            # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
            except openai.BadRequestError as e:
                print("Bad Request Error", e)
                return ""
sglang/test/simple_eval_humaneval.py
CHANGED
@@ -121,7 +121,7 @@ class HumanEval(Eval):
                convo=convo,
                metrics={
                    f"pass@{k}": estimate_pass_at_k([total], [correct], k)
-                    # this will be
+                    # this will be aggregated so no need of .mean()
                    for k in self._ks_passes
                    if total >= k
                },
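For context on the metric in this hunk: estimate_pass_at_k computes the standard unbiased pass@k estimator (as in the original HumanEval evaluation). A minimal reference sketch of that estimator, not the sglang implementation:

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    # n = samples generated, c = samples that passed, k = evaluation budget.
    # Unbiased estimator: 1 - C(n - c, k) / C(n, k), computed stably as a product.
    if n - c < k:
        return 1.0
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

print(pass_at_k(n=20, c=3, k=5))  # probability that at least 1 of 5 drawn samples passes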
sglang/test/test_cutlass_moe.py
ADDED
@@ -0,0 +1,278 @@
+import argparse
+import time
+
+import torch
+import triton  # Added import
+import triton.testing  # Added import
+from transformers import AutoConfig
+
+from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+
+
+def get_model_config(tp_size: int):
+    config = AutoConfig.from_pretrained(
+        "deepseek-ai/deepseek-R1", trust_remote_code=True
+    )
+    E = config.n_routed_experts
+    topk = config.num_experts_per_tok
+    intermediate_size = config.moe_intermediate_size
+    shard_intermediate_size = 2 * intermediate_size // tp_size
+
+    return {
+        "num_experts": E,
+        "topk": topk,
+        "hidden_size": config.hidden_size,
+        "shard_intermediate_size": shard_intermediate_size,
+        "dtype": config.torch_dtype,
+        "block_shape": config.quantization_config["weight_block_size"],
+    }
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    """Converts tensor to FP8 E4M3, scaling values to fit the range."""
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    # Calculate max absolute value safely
+    max_val = torch.max(torch.abs(tensor))
+    # Avoid division by zero if tensor is all zeros
+    if max_val == 0:
+        scale_factor = 1.0
+    else:
+        # Scale factor to bring the max value to finfo.max
+        scale_factor = finfo.max / max_val
+
+    # Apply scaling
+    scaled_tensor = tensor * scale_factor
+
+    # Clamp and convert
+    fp8_tensor = scaled_tensor.clamp(min=finfo.min, max=finfo.max).to(
+        dtype=torch.float8_e4m3fn
+    )
+    return fp8_tensor
+
+
+def run_test(tp_size, batch_size, model_config, check=False):
+    print(f"\n--- Batch Size: {batch_size} ---")
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(42)  # For reproducible random numbers
+
+    E = model_config["num_experts"]
+    topk = model_config["topk"]
+    H = model_config["hidden_size"]
+    I = model_config["shard_intermediate_size"]
+    block_shape = model_config["block_shape"]  # Tuple (BLOCK_N, BLOCK_K)
+    dtype = model_config["dtype"]  # e.g., torch.bfloat16
+
+    print(
+        f"Config: E={E}, topk={topk}, H={H}, I_shard={I}, dtype={dtype}, block_shape={block_shape}"
+    )
+
+    # --- Input Data ---
+    # Use bf16/fp16 for input activation based on model config
+    x = torch.randn((batch_size, H), device="cuda", dtype=dtype) * 0.0001
+    # --- Weights (Generate in higher precision, then convert to FP8) ---
+    # Generate weights suitable for FP8 conversion (e.g., scaled appropriately)
+    w1_hp = (
+        torch.randn((E, I, H), device="cuda", dtype=torch.float32) * 0.00001 + 0.00001
+    )
+    w2_hp = (
+        torch.randn((E, H, I // 2), device="cuda", dtype=torch.float32) * 0.00001
+        + 0.00001
+    )
+
+    w1 = to_fp8(w1_hp)
+    w2 = to_fp8(w2_hp)
+
+    # --- Scales for FP8 Weights ---
+    block_n, block_k = block_shape
+    # Calculate number of blocks needed
+    w1_blocks_dim1 = (I + block_n - 1) // block_n
+    w1_blocks_dim2 = (H + block_k - 1) // block_k
+    w2_blocks_dim1 = (H + block_n - 1) // block_n
+    w2_blocks_dim2 = (I // 2 + block_k - 1) // block_k
+
+    # Scales are typically float32 or float16/bfloat16
+    scale_dtype = torch.float32  # Or dtype if scales match model dtype
+    w1_scale = torch.full(
+        (E, w1_blocks_dim1, w1_blocks_dim2), 1, device="cuda", dtype=scale_dtype
+    )  # Avoid zero scales
+    w2_scale = torch.full(
+        (E, w2_blocks_dim1, w2_blocks_dim2), 1, device="cuda", dtype=scale_dtype
+    )  # Avoid zero scales
+
+    # --- Routing Information ---
+    topk_weights = torch.softmax(
+        torch.rand(batch_size, topk, device="cuda", dtype=dtype), dim=-1
+    )
+    topk_ids = torch.randint(0, E, (batch_size, topk), dtype=torch.int32, device="cuda")
+
+    a1_strides = torch.full((E,), H, dtype=torch.int64, device="cuda")
+    c1_strides = torch.full((E,), I, dtype=torch.int64, device="cuda")
+    a2_strides = torch.full((E,), I // 2, dtype=torch.int64, device="cuda")
+    c2_strides = torch.full((E,), H, dtype=torch.int64, device="cuda")
+
+    workspace = torch.empty(
+        (7182 * 1024), device="cuda", dtype=torch.uint8
+    )  # Allocate sufficient workspace
+    # Pointer arrays (often filled by the kernel or a prep step, but needed as args)
+    a_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    b_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    out_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    a_scales_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    b_scales_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    expert_offsets = torch.empty((E + 1,), dtype=torch.int32, device="cuda")
+    problem_sizes1 = torch.empty((E, 3), dtype=torch.int32, device="cuda")
+    problem_sizes2 = torch.empty((E, 3), dtype=torch.int32, device="cuda")
+
+    # --- Lambdas for Benchmarking ---
+    cutlass_lambda = lambda: cutlass_fused_experts(
+        x,
+        w1.transpose(1, 2),  # Transposed
+        w2.transpose(1, 2),  # Transposed
+        w1_scale.transpose(1, 2),
+        w2_scale.transpose(1, 2),
+        topk_weights,
+        topk_ids,
+        a1_strides,
+        c1_strides,
+        a2_strides,
+        c2_strides,
+        workspace,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+    )
+
+    # Note: Triton expects non-transposed weights
+    triton_lambda = lambda: fused_experts(
+        x,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        inplace=False,  # Use False for benchmarking to avoid side effects if run multiple times
+        activation="silu",  # Assuming SiLU activation common in MoEs
+        use_fp8_w8a8=True,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        block_shape=block_shape,
+    )
+
+    # --- Warmup ---
+    print("Warming up...")
+    for _ in range(10):
+        _ = cutlass_lambda()
+        _ = triton_lambda()
+    torch.cuda.synchronize()
+
+    # --- Benchmarking ---
+    quantiles = [0.5, 0.2, 0.8]
+    print(f"Benchmarking Cutlass fused_experts...")
+    cutlass_ms, cutlass_min, cutlass_max = triton.testing.do_bench_cudagraph(
+        cutlass_lambda, rep=1000, quantiles=quantiles
+    )
+
+    print(f"Benchmarking Triton fused_experts...")
+    triton_ms, triton_min, triton_max = triton.testing.do_bench_cudagraph(
+        triton_lambda, rep=1000, quantiles=quantiles
+    )
+    print(
+        f"Cutlass fused_experts time: {cutlass_ms:.3f} ms (median) [{cutlass_min:.3f} - {cutlass_max:.3f}]"
+    )
+    print(
+        f"Triton fused_experts time: {triton_ms:.3f} ms (median) [{triton_min:.3f} - {triton_max:.3f}]"
+    )
+
+    # --- Correctness Check ---
+    if check:
+        print("Running correctness check...")
+        with torch.no_grad():
+            # Run CUTLASS version (requires transposed weights)
+            y_cutlass = cutlass_fused_experts(
+                x,
+                w1.transpose(1, 2),  # Transposed
+                w2.transpose(1, 2),  # Transposed
+                w1_scale.transpose(1, 2),
+                w2_scale.transpose(1, 2),
+                topk_weights,
+                topk_ids,
+                a1_strides,
+                c1_strides,
+                a2_strides,
+                c2_strides,
+                workspace,
+                a_ptrs,
+                b_ptrs,
+                out_ptrs,
+                a_scales_ptrs,
+                b_scales_ptrs,
+                expert_offsets,
+                problem_sizes1,
+                problem_sizes2,
+            )
+
+            # Run Triton version (requires original shape weights, use inplace=False)
+            y_triton = fused_experts(
+                x,
+                w1,  # Original shape
+                w2,  # Original shape
+                topk_weights,
+                topk_ids,
+                inplace=False,  # Important: Use False to get output tensor
+                activation="silu",
+                use_fp8_w8a8=True,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                block_shape=block_shape,
+            )
+
+            # Ensure outputs are same dtype for comparison
+            y_cutlass = y_cutlass.to(dtype)
+            y_triton = y_triton.to(dtype)
+
+            abs_error = torch.abs(y_cutlass - y_triton)
+            rel_error = abs_error / torch.clamp(torch.abs(y_triton), min=1e-2)
+
+            max_abs_err = abs_error.max().item()
+            max_rel_err = rel_error.max().item()
+
+            print("y_cutlass:", y_cutlass[:, :10])
+            print("y_triton:", y_triton[:, :10])
+            print(f"Max absolute error: {max_abs_err:.6f}")
+            print(f"Max relative error: {max_rel_err:.6f}")
+
+            # Tolerance might need adjustment based on FP8 specifics and kernel differences
+            # FP8 comparisons often require higher tolerance than FP16/BF16
+            assert max_rel_err < 5e-1, f"Relative error too high! {max_rel_err}"
+            print("Correctness check passed.")
+
+
+def main(tp_size=8, batch_sizes=[1, 4, 8, 16, 32, 64, 128, 256, 512], check=False):
+    model_config = get_model_config(tp_size)
+    print("Model Config:", model_config)
+    for batch_size in batch_sizes:
+        run_test(tp_size, batch_size, model_config, check)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tp-size", type=int, default=8, help="Tensor Parallel size")
+    parser.add_argument(
+        "--batch-sizes",
+        type=int,
+        nargs="+",
+        default=[1, 4, 8, 16, 32, 64, 128, 256, 512],  # Adjusted default
+        help="List of batch sizes to test",
+    )
+    parser.add_argument("--check", action="store_true", help="Enable check mode")
+    args = parser.parse_args()
+
+    print(f"Running benchmarks with TP size: {args.tp_size}")
+    print(f"Testing batch sizes: {args.batch_sizes}")
+
+    main(tp_size=args.tp_size, batch_sizes=args.batch_sizes, check=args.check)
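A hedged usage sketch for the new benchmark above (function names are taken from the file itself; running it needs a CUDA GPU, the sglang CUTLASS and Triton MoE kernels, and access to the deepseek-ai/deepseek-R1 config):

from sglang.test.test_cutlass_moe import main

# Reduced sweep for a quick local check; these batch sizes are illustrative, not the defaults.
main(tp_size=8, batch_sizes=[1, 64], check=True)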
sglang/test/test_programs.py
CHANGED
@@ -370,7 +370,7 @@ def test_dtype_gen():
     @sgl.function
     def dtype_gen(s):
         s += "Q: What is the full name of DNS?\n"
-        s += "A: The full
+        s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
         s += "Q: Which year was DNS invented?\n"
         s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
         s += "Q: What is the value of pi?\n"
@@ -503,7 +503,7 @@ def test_hellaswag_select():
     #####################################
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
     preds = []
     for i, ret in enumerate(rets):
         preds.append(choices[i].index(ret["answer"]))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     accuracy = np.mean(np.array(preds) == np.array(labels))
 
     # Test generator style of run_batch
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
     preds_gen = []
     for i, ret in enumerate(rets):
         preds_gen.append(choices[i].index(ret["answer"]))
-    latency_gen = time.time() - tic
+    latency_gen = time.perf_counter() - tic
 
     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
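Note: this file (along with test_utils.py and sglang/utils.py below) switches interval timing from time.time() to time.perf_counter(). perf_counter is monotonic and high-resolution, so elapsed-time measurements are not skewed by NTP or other system clock adjustments. A minimal illustration of the pattern:

import time

tic = time.perf_counter()           # monotonic, high-resolution start mark
time.sleep(0.1)                     # stand-in for the work being timed
elapsed = time.perf_counter() - tic
print(f"elapsed: {elapsed:.3f}s")   # wall-clock changes do not affect this delta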
sglang/test/test_utils.py
CHANGED
@@ -395,12 +395,12 @@ def popen_launch_server(
     other_args: list[str] = (),
     env: Optional[dict] = None,
     return_stdout_stderr: Optional[tuple] = None,
-    pd_seperated: bool = False,
+    pd_separated: bool = False,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
 
-    if pd_seperated:
+    if pd_separated:
         command = "sglang.launch_pd_server"
     else:
         command = "sglang.launch_server"
@@ -414,7 +414,7 @@ def popen_launch_server(
         *[str(x) for x in other_args],
     ]
 
-    if pd_seperated:
+    if pd_separated:
         command.extend(
             [
                 "--lb-host",
@@ -449,9 +449,9 @@ def popen_launch_server(
     else:
         process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
 
-    start_time = time.time()
+    start_time = time.perf_counter()
     with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 headers = {
                     "Content-Type": "application/json; charset=utf-8",
@@ -478,6 +478,47 @@ def popen_launch_server(
     raise TimeoutError("Server failed to start within the timeout period.")
 
 
+def popen_launch_pd_server(
+    model: str,
+    base_url: str,
+    timeout: float,
+    api_key: Optional[str] = None,
+    other_args: list[str] = (),
+    env: Optional[dict] = None,
+):
+    _, host, port = base_url.split(":")
+    host = host[2:]
+
+    command = "sglang.launch_server"
+
+    command = [
+        "python3",
+        "-m",
+        command,
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    command.extend(
+        [
+            "--host",
+            host,
+            "--port",
+            port,
+        ]
+    )
+
+    if api_key:
+        command += ["--api-key", api_key]
+
+    print(f"command={' '.join(command)}")
+
+    process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
+
+    return process
+
+
 def run_with_timeout(
     func: Callable,
     args: tuple = (),
@@ -509,7 +550,7 @@ class TestFile:
 
 
 def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-    tic = time.time()
+    tic = time.perf_counter()
     success = True
 
     for i, file in enumerate(files):
@@ -524,13 +565,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
             flush=True,
         )
-        tic = time.time()
+        tic = time.perf_counter()
 
         process = subprocess.Popen(
             ["python3", filename], stdout=None, stderr=None, env=os.environ
         )
         process.wait()
-        elapsed = time.time() - tic
+        elapsed = time.perf_counter() - tic
 
         print(
             f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -556,9 +597,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             break
 
     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
 
     return 0 if success else -1
 
@@ -581,7 +622,7 @@ def get_benchmark_args(
     disable_stream=False,
     disable_ignore_eos=False,
     seed: int = 0,
-    pd_seperated: bool = False,
+    pd_separated: bool = False,
 ):
     return SimpleNamespace(
         backend="sglang",
@@ -611,7 +652,7 @@ def get_benchmark_args(
         profile=None,
         lora_name=None,
         prompt_suffix="",
-        pd_seperated=pd_seperated,
+        pd_separated=pd_separated,
     )
 
 
@@ -675,7 +716,7 @@ def run_bench_serving_multi(
     other_server_args,
     benchmark_args,
     need_warmup=False,
-    pd_seperated=False,
+    pd_separated=False,
 ):
     # Launch the server
     process = popen_launch_server(
@@ -683,7 +724,7 @@ def run_bench_serving_multi(
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_server_args,
-        pd_seperated=pd_seperated,
+        pd_separated=pd_separated,
     )
 
     # run benchmark for all
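A hedged sketch of how a test might use the renamed pd_separated flag and the new popen_launch_pd_server helper (the model path, URLs, and timeout are illustrative values, not taken from this diff):

from sglang.test.test_utils import popen_launch_pd_server, popen_launch_server

# Route the launch through sglang.launch_pd_server via the renamed flag.
proc = popen_launch_server(
    "Qwen/Qwen2.5-7B-Instruct",  # illustrative model path
    "http://127.0.0.1:30000",
    timeout=600,
    pd_separated=True,
)

# The new helper just starts a sglang.launch_server process and returns it;
# unlike popen_launch_server, it does not poll the server for readiness.
worker = popen_launch_pd_server(
    "Qwen/Qwen2.5-7B-Instruct",
    "http://127.0.0.1:30100",
    timeout=600,
)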
sglang/utils.py
CHANGED
@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
            f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
        )
        if signum == signal.SIGTERM:
-            logger.info(f"{sub_module_name}
+            logger.info(f"{sub_module_name} receive sigterm")
 
    signal.signal(signal.SIGTERM, graceful_shutdown)
 
@@ -436,7 +436,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
        base_url: The base URL of the server
        timeout: Maximum time to wait in seconds. None means wait forever.
    """
-    start_time = time.time()
+    start_time = time.perf_counter()
    while True:
        try:
            response = requests.get(
@@ -455,7 +455,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
                )
                break
 
-            if timeout and time.time() - start_time > timeout:
+            if timeout and time.perf_counter() - start_time > timeout:
                raise TimeoutError("Server did not become ready within timeout period")
        except requests.exceptions.RequestException:
            time.sleep(1)
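A minimal usage sketch of the helper touched above (the URL is illustrative):

from sglang.utils import wait_for_server

# Blocks until the server responds, or raises TimeoutError after 300 seconds.
wait_for_server("http://127.0.0.1:30000", timeout=300)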
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.6.post3"
+__version__ = "0.4.6.post5"