sglang 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. sglang/bench_latency.py +31 -13
  2. sglang/bench_server_latency.py +21 -10
  3. sglang/bench_serving.py +101 -7
  4. sglang/global_config.py +0 -1
  5. sglang/srt/conversation.py +11 -2
  6. sglang/srt/layers/attention/__init__.py +27 -5
  7. sglang/srt/layers/attention/double_sparsity_backend.py +281 -0
  8. sglang/srt/layers/attention/flashinfer_backend.py +352 -83
  9. sglang/srt/layers/attention/triton_backend.py +6 -4
  10. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
  11. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
  12. sglang/srt/layers/attention/triton_ops/prefill_attention.py +4 -2
  13. sglang/srt/layers/sampler.py +6 -2
  14. sglang/srt/managers/data_parallel_controller.py +177 -0
  15. sglang/srt/managers/detokenizer_manager.py +31 -10
  16. sglang/srt/managers/io_struct.py +11 -2
  17. sglang/srt/managers/schedule_batch.py +126 -43
  18. sglang/srt/managers/schedule_policy.py +2 -1
  19. sglang/srt/managers/scheduler.py +245 -142
  20. sglang/srt/managers/tokenizer_manager.py +14 -1
  21. sglang/srt/managers/tp_worker.py +111 -1
  22. sglang/srt/mem_cache/chunk_cache.py +8 -4
  23. sglang/srt/mem_cache/memory_pool.py +77 -4
  24. sglang/srt/mem_cache/radix_cache.py +15 -7
  25. sglang/srt/model_executor/cuda_graph_runner.py +4 -4
  26. sglang/srt/model_executor/forward_batch_info.py +16 -21
  27. sglang/srt/model_executor/model_runner.py +100 -36
  28. sglang/srt/models/baichuan.py +2 -3
  29. sglang/srt/models/chatglm.py +5 -6
  30. sglang/srt/models/commandr.py +1 -2
  31. sglang/srt/models/dbrx.py +1 -2
  32. sglang/srt/models/deepseek.py +4 -5
  33. sglang/srt/models/deepseek_v2.py +5 -6
  34. sglang/srt/models/exaone.py +1 -2
  35. sglang/srt/models/gemma.py +2 -2
  36. sglang/srt/models/gemma2.py +5 -5
  37. sglang/srt/models/gpt_bigcode.py +5 -5
  38. sglang/srt/models/grok.py +1 -2
  39. sglang/srt/models/internlm2.py +1 -2
  40. sglang/srt/models/llama.py +1 -2
  41. sglang/srt/models/llama_classification.py +1 -2
  42. sglang/srt/models/llama_reward.py +2 -3
  43. sglang/srt/models/llava.py +4 -8
  44. sglang/srt/models/llavavid.py +1 -2
  45. sglang/srt/models/minicpm.py +1 -2
  46. sglang/srt/models/minicpm3.py +5 -6
  47. sglang/srt/models/mixtral.py +1 -2
  48. sglang/srt/models/mixtral_quant.py +1 -2
  49. sglang/srt/models/olmo.py +352 -0
  50. sglang/srt/models/olmoe.py +1 -2
  51. sglang/srt/models/qwen.py +1 -2
  52. sglang/srt/models/qwen2.py +1 -2
  53. sglang/srt/models/qwen2_moe.py +4 -5
  54. sglang/srt/models/stablelm.py +1 -2
  55. sglang/srt/models/torch_native_llama.py +1 -2
  56. sglang/srt/models/xverse.py +1 -2
  57. sglang/srt/models/xverse_moe.py +4 -5
  58. sglang/srt/models/yivl.py +1 -2
  59. sglang/srt/openai_api/adapter.py +97 -52
  60. sglang/srt/openai_api/protocol.py +10 -2
  61. sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
  62. sglang/srt/sampling/sampling_batch_info.py +105 -59
  63. sglang/srt/sampling/sampling_params.py +2 -0
  64. sglang/srt/server.py +171 -37
  65. sglang/srt/server_args.py +127 -48
  66. sglang/srt/utils.py +37 -14
  67. sglang/test/few_shot_gsm8k.py +4 -1
  68. sglang/test/few_shot_gsm8k_engine.py +144 -0
  69. sglang/test/srt/sampling/penaltylib/utils.py +16 -12
  70. sglang/version.py +1 -1
  71. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/METADATA +82 -32
  72. sglang-0.3.4.dist-info/RECORD +143 -0
  73. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/WHEEL +1 -1
  74. sglang/srt/layers/attention/flashinfer_utils.py +0 -237
  75. sglang-0.3.3.dist-info/RECORD +0 -139
  76. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/LICENSE +0 -0
  77. {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -35,11 +35,12 @@ class ServerArgs:
  tokenizer_mode: str = "auto"
  skip_tokenizer_init: bool = False
  load_format: str = "auto"
+ trust_remote_code: bool = True
  dtype: str = "auto"
  kv_cache_dtype: str = "auto"
- trust_remote_code: bool = True
- context_length: Optional[int] = None
  quantization: Optional[str] = None
+ context_length: Optional[int] = None
+ device: str = "cuda"
  served_model_name: Optional[str] = None
  chat_template: Optional[str] = None
  is_embedding: bool = False
@@ -72,6 +73,7 @@ class ServerArgs:
  # Other
  api_key: Optional[str] = None
  file_storage_pth: str = "SGLang_storage"
+ enable_cache_report: bool = False

  # Data parallelism
  dp_size: int = 1
@@ -85,10 +87,23 @@ class ServerArgs:
  # Model override args in JSON
  json_model_override_args: str = "{}"

- # Optimization/debug options
+ # Double Sparsity
+ enable_double_sparsity: bool = False
+ ds_channel_config_path: str = None
+ ds_heavy_channel_num: int = 32
+ ds_heavy_token_num: int = 256
+ ds_heavy_channel_type: str = "qk"
+ ds_sparse_decode_threshold: int = 4096
+
+ # LoRA
+ lora_paths: Optional[List[str]] = None
+ max_loras_per_batch: int = 8
+
+ # Kernel backend
  attention_backend: Optional[str] = None
  sampling_backend: Optional[str] = None

+ # Optimization/debug options
  disable_flashinfer: bool = False
  disable_flashinfer_sampling: bool = False
  disable_radix_cache: bool = False
@@ -98,16 +113,16 @@ class ServerArgs:
  disable_disk_cache: bool = False
  disable_custom_all_reduce: bool = False
  disable_mla: bool = False
+ disable_penalizer: bool = False
+ disable_nan_detection: bool = False
+ enable_overlap_schedule: bool = False
  enable_mixed_chunk: bool = False
  enable_torch_compile: bool = False
  max_torch_compile_bs: int = 32
  torchao_config: str = ""
  enable_p2p_check: bool = False
  triton_attention_reduce_in_fp32: bool = False
-
- # LoRA
- lora_paths: Optional[List[str]] = None
- max_loras_per_batch: int = 8
+ num_continuous_decode_steps: int = 1

  def __post_init__(self):
  # Set missing default values
@@ -223,6 +238,11 @@ class ServerArgs:
  '"dummy" will initialize the weights with random values, '
  "which is mainly for profiling.",
  )
+ parser.add_argument(
+ "--trust-remote-code",
+ action="store_true",
+ help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
+ )
  parser.add_argument(
  "--dtype",
  type=str,
@@ -244,17 +264,6 @@ class ServerArgs:
  choices=["auto", "fp8_e5m2"],
  help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.',
  )
- parser.add_argument(
- "--trust-remote-code",
- action="store_true",
- help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
- )
- parser.add_argument(
- "--context-length",
- type=int,
- default=ServerArgs.context_length,
- help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
- )
  parser.add_argument(
  "--quantization",
  type=str,
@@ -270,6 +279,19 @@ class ServerArgs:
  ],
  help="The quantization method.",
  )
+ parser.add_argument(
+ "--context-length",
+ type=int,
+ default=ServerArgs.context_length,
+ help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
+ )
+ parser.add_argument(
+ "--device",
+ type=str,
+ default="cuda",
+ choices=["cuda", "xpu"],
+ help="The device type.",
+ )
  parser.add_argument(
  "--served-model-name",
  type=str,
@@ -390,6 +412,11 @@ class ServerArgs:
  default=ServerArgs.file_storage_pth,
  help="The path of the file storage in backend.",
  )
+ parser.add_argument(
+ "--enable-cache-report",
+ action="store_true",
+ help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
+ )

  # Data parallelism
  parser.add_argument(
@@ -432,7 +459,60 @@ class ServerArgs:
  default=ServerArgs.json_model_override_args,
  )

- # Optimization/debug options
+ # Double Sparsity
+ parser.add_argument(
+ "--enable-double-sparsity",
+ action="store_true",
+ help="Enable double sparsity attention",
+ )
+ parser.add_argument(
+ "--ds-channel-config-path",
+ type=str,
+ default=ServerArgs.ds_channel_config_path,
+ help="The path of the double sparsity channel config",
+ )
+ parser.add_argument(
+ "--ds-heavy-channel-num",
+ type=int,
+ default=ServerArgs.ds_heavy_channel_num,
+ help="The number of heavy channels in double sparsity attention",
+ )
+ parser.add_argument(
+ "--ds-heavy-token-num",
+ type=int,
+ default=ServerArgs.ds_heavy_token_num,
+ help="The number of heavy tokens in double sparsity attention",
+ )
+ parser.add_argument(
+ "--ds-heavy-channel-type",
+ type=str,
+ default=ServerArgs.ds_heavy_channel_type,
+ help="The type of heavy channels in double sparsity attention",
+ )
+ parser.add_argument(
+ "--ds-sparse-decode-threshold",
+ type=int,
+ default=ServerArgs.ds_sparse_decode_threshold,
+ help="The type of heavy channels in double sparsity attention",
+ )
+
+ # LoRA
+ parser.add_argument(
+ "--lora-paths",
+ type=str,
+ nargs="*",
+ default=None,
+ action=LoRAPathAction,
+ help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
+ )
+ parser.add_argument(
+ "--max-loras-per-batch",
+ type=int,
+ default=8,
+ help="Maximum number of adapters for a running batch, include base-only request",
+ )
+
+ # Kernel backend
  parser.add_argument(
  "--attention-backend",
  type=str,
@@ -447,6 +527,8 @@ class ServerArgs:
  default=ServerArgs.sampling_backend,
  help="Choose the kernels for sampling layers.",
  )
+
+ # Optimization/debug options
  parser.add_argument(
  "--disable-flashinfer",
  action="store_true",
@@ -493,6 +575,21 @@ class ServerArgs:
  action="store_true",
  help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
  )
+ parser.add_argument(
+ "--disable-penalizer",
+ action="store_true",
+ help="Disable the logit penalizers (e.g., frequency and repetition penalty) for better performance if they are not used in any requests.",
+ )
+ parser.add_argument(
+ "--disable-nan-detection",
+ action="store_true",
+ help="Disable the NaN detection for better performance.",
+ )
+ parser.add_argument(
+ "--enable-overlap-schedule",
+ action="store_true",
+ help="Overlap the CPU scheduler with GPU model worker. Experimental feature.",
+ )
  parser.add_argument(
  "--enable-mixed-chunk",
  action="store_true",
@@ -527,25 +624,12 @@ class ServerArgs:
  "This only affects Triton attention kernels.",
  )
  parser.add_argument(
- "--efficient-weight-load",
- action="store_true",
- help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
- )
-
- # LoRA options
- parser.add_argument(
- "--lora-paths",
- type=str,
- nargs="*",
- default=None,
- action=LoRAPathAction,
- help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
- )
- parser.add_argument(
- "--max-loras-per-batch",
+ "--num-continuous-decode-steps",
  type=int,
- default=8,
- help="Maximum number of adapters for a running batch, include base-only request",
+ default=ServerArgs.num_continuous_decode_steps,
+ help="Run multiple continuous decoding steps to reduce scheduling overhead. "
+ "This can potentially increase throughput but may also increase time-to-first-token latency. "
+ "The default value is 1, meaning only run one decoding step at a time.",
  )

  @classmethod
@@ -566,7 +650,7 @@ class ServerArgs:
  self.tp_size % self.nnodes == 0
  ), "tp_size must be divisible by number of nodes"
  assert not (
- self.dp_size > 1 and self.node_rank is not None
+ self.dp_size > 1 and self.nnodes != 1
  ), "multi-node data parallel is not supported"
  assert (
  self.max_loras_per_batch > 0
@@ -575,11 +659,6 @@ class ServerArgs:
  and (self.lora_paths is None or self.disable_radix_cache)
  ), "compatibility of lora and cuda graph and radix attention is in progress"

- assert self.dp_size == 1, (
- "The support for data parallelism is temporarily disabled during refactor. "
- "Please use sglang<=0.3.2 or wait for later updates."
- )
-
  if isinstance(self.lora_paths, list):
  lora_paths = self.lora_paths
  self.lora_paths = {}
@@ -618,11 +697,11 @@ class PortArgs:
  # The ipc filename for detokenizer to receive inputs from scheduler (zmq)
  detokenizer_ipc_name: str

- # The port for nccl initialization for multiple TP groups (torch.dist)
- nccl_ports: List[int]
+ # The port for nccl initialization (torch.dist)
+ nccl_port: int

- @classmethod
- def init_new(self, server_args):
+ @staticmethod
+ def init_new(server_args) -> "PortArgs":
  port = server_args.port + 1
  while True:
  if is_port_available(port):
@@ -633,7 +712,7 @@ class PortArgs:
  tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
  scheduler_input_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
  detokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
- nccl_ports=[port],
+ nccl_port=port,
  )

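The hunks above regroup the ServerArgs fields into Double Sparsity, LoRA, kernel backend, and optimization/debug sections, and add device, enable_cache_report, and num_continuous_decode_steps. The sketch below is illustrative only: the field names come from the diff, while model_path, the exact required fields, and direct dataclass construction as an entry point are assumptions.

# Hypothetical sketch: constructing the 0.3.4 ServerArgs dataclass directly.
# Field names are taken from the diff above; model_path is an assumed required field.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Meta-Llama-3.1-8B-Instruct",  # assumption, not shown in this diff
    device="cuda",                      # new in 0.3.4: "cuda" or "xpu"
    enable_cache_report=True,           # report cached tokens in usage.prompt_tokens_details
    num_continuous_decode_steps=2,      # >1 reduces scheduling overhead, may raise TTFT
    enable_overlap_schedule=False,      # experimental CPU/GPU overlap, off by default
)
print(args.device, args.num_continuous_decode_steps)
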
sglang/srt/utils.py CHANGED
@@ -35,7 +35,7 @@ import psutil
  import requests
  import torch
  import torch.distributed as dist
- from fastapi.responses import JSONResponse
+ from fastapi.responses import ORJSONResponse
  from packaging import version as pkg_version
  from torch import nn
  from torch.profiler import ProfilerActivity, profile, record_function
@@ -140,26 +140,41 @@ def calculate_time(show=False, min_cost_ms=0.0):
  return wrapper


- def get_available_gpu_memory(gpu_id, distributed=False):
+ def get_available_gpu_memory(device, gpu_id, distributed=False):
  """
  Get available memory for cuda:gpu_id device.
  When distributed is True, the available memory is the minimum available memory of all GPUs.
  """
- num_gpus = torch.cuda.device_count()
- assert gpu_id < num_gpus
+ if device == "cuda":
+ num_gpus = torch.cuda.device_count()
+ assert gpu_id < num_gpus
+
+ if torch.cuda.current_device() != gpu_id:
+ print(
+ f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
+ "which may cause useless memory allocation for torch CUDA context.",
+ )

- if torch.cuda.current_device() != gpu_id:
- print(
- f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
- "which may cause useless memory allocation for torch CUDA context.",
- )
+ torch.cuda.empty_cache()
+ free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)

- torch.cuda.empty_cache()
- free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
+ elif device == "xpu":
+ num_gpus = torch.xpu.device_count()
+ assert gpu_id < num_gpus
+
+ if torch.xpu.current_device() != gpu_id:
+ print(
+ f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
+ "which may cause useless memory allocation for torch XPU context.",
+ )
+ torch.xpu.empty_cache()
+ used_memory = torch.xpu.memory_allocated()
+ total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
+ free_gpu_memory = total_gpu_memory - used_memory

  if distributed:
  tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
- torch.device("cuda", gpu_id)
+ torch.device(device, gpu_id)
  )
  torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.MIN)
  free_gpu_memory = tensor.item()
@@ -551,7 +566,7 @@ def add_api_key_middleware(app, api_key: str):
  if request.url.path.startswith("/health"):
  return await call_next(request)
  if request.headers.get("Authorization") != "Bearer " + api_key:
- return JSONResponse(content={"error": "Unauthorized"}, status_code=401)
+ return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
  return await call_next(request)

@@ -569,10 +584,11 @@ def prepare_model_and_tokenizer(model_path: str, tokenizer_path: str):

  def configure_logger(server_args, prefix: str = ""):
  format = f"[%(asctime)s{prefix}] %(message)s"
+ # format = f"[%(asctime)s.%(msecs)03d{prefix}] %(message)s"
  logging.basicConfig(
  level=getattr(logging, server_args.log_level.upper()),
  format=format,
- datefmt="%H:%M:%S",
+ datefmt="%Y-%m-%d %H:%M:%S",
  force=True,
  )

@@ -675,3 +691,10 @@ def pytorch_profile(name, func, *args, data_size=-1):
  prof.export_chrome_trace(f"trace/{name}_{step_counter}.json")
  step_counter += 1
  return result
+
+
+ def first_rank_print(*args, **kwargs):
+ if torch.cuda.current_device() == 0:
+ print(*args, **kwargs)
+ else:
+ pass
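get_available_gpu_memory now takes the device string as its first argument so the same helper serves CUDA and XPU. A minimal call sketch, using only the signature visible in the hunk above (the unit and any scaling of the returned value are handled outside the lines shown and are not assumed here):

# Sketch based on the new signature: get_available_gpu_memory(device, gpu_id, distributed=False)
import torch
from sglang.srt.utils import get_available_gpu_memory

if torch.cuda.is_available():
    free_mem = get_available_gpu_memory("cuda", gpu_id=0)
    print(f"free memory on cuda:0 -> {free_mem}")
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    free_mem = get_available_gpu_memory("xpu", gpu_id=0)
    print(f"free memory on xpu:0 -> {free_mem}")
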
sglang/test/few_shot_gsm8k.py CHANGED
@@ -76,7 +76,9 @@ def run_eval(args):
  def few_shot_gsm8k(s, question):
  s += few_shot_examples + question
  s += sgl.gen(
- "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"]
+ "answer",
+ max_tokens=args.max_new_tokens,
+ stop=["Question", "Assistant:", "<|separator|>"],
  )

  #####################################
@@ -131,6 +133,7 @@ if __name__ == "__main__":
  parser.add_argument("--num-shots", type=int, default=5)
  parser.add_argument("--data-path", type=str, default="test.jsonl")
  parser.add_argument("--num-questions", type=int, default=200)
+ parser.add_argument("--max-new-tokens", type=int, default=512)
  parser.add_argument("--parallel", type=int, default=128)
  parser.add_argument("--host", type=str, default="http://127.0.0.1")
  parser.add_argument("--port", type=int, default=30000)
sglang/test/few_shot_gsm8k_engine.py ADDED
@@ -0,0 +1,144 @@
+ import argparse
+ import ast
+ import asyncio
+ import json
+ import re
+ import time
+
+ import numpy as np
+
+ import sglang as sgl
+ from sglang.api import set_default_backend
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+ from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
+
+ INVALID = -9999999
+
+
+ def get_one_example(lines, i, include_answer):
+ ret = "Question: " + lines[i]["question"] + "\nAnswer:"
+ if include_answer:
+ ret += " " + lines[i]["answer"]
+ return ret
+
+
+ def get_few_shot_examples(lines, k):
+ ret = ""
+ for i in range(k):
+ ret += get_one_example(lines, i, True) + "\n\n"
+ return ret
+
+
+ def get_answer_value(answer_str):
+ answer_str = answer_str.replace(",", "")
+ numbers = re.findall(r"\d+", answer_str)
+ if len(numbers) < 1:
+ return INVALID
+ try:
+ return ast.literal_eval(numbers[-1])
+ except SyntaxError:
+ return INVALID
+
+
+ async def concurrent_generate(engine, prompts, sampling_param):
+ tasks = []
+ for prompt in prompts:
+ tasks.append(asyncio.create_task(engine.async_generate(prompt, sampling_param)))
+
+ outputs = await asyncio.gather(*tasks)
+ return outputs
+
+
+ def run_eval(args):
+ # Select backend
+ engine = sgl.Engine(model_path=args.model_path, log_level="error")
+
+ if args.local_data_path is None:
+ # Read data
+ url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+ filename = download_and_cache_file(url)
+ else:
+ filename = args.local_data_path
+
+ lines = list(read_jsonl(filename))
+
+ # Construct prompts
+ num_questions = args.num_questions
+ num_shots = args.num_shots
+ few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+ questions = []
+ labels = []
+ for i in range(len(lines[:num_questions])):
+ questions.append(get_one_example(lines, i, False))
+ labels.append(get_answer_value(lines[i]["answer"]))
+ assert all(l != INVALID for l in labels)
+ arguments = [{"question": q} for q in questions]
+
+ # construct the prompts
+ prompts = []
+ for i, arg in enumerate(arguments):
+ q = arg["question"]
+ prompt = few_shot_examples + q
+ prompts.append(prompt)
+
+ sampling_param = {
+ "stop": ["Question", "Assistant:", "<|separator|>"],
+ "max_new_tokens": 512,
+ "temperature": 0,
+ }
+
+ # Run requests
+ tic = time.time()
+
+ loop = asyncio.get_event_loop()
+
+ outputs = loop.run_until_complete(
+ concurrent_generate(engine, prompts, sampling_param)
+ )
+
+ # End requests
+ latency = time.time() - tic
+
+ # Shutdown the engine
+ engine.shutdown()
+
+ # Parse output
+ preds = []
+
+ for output in outputs:
+ preds.append(get_answer_value(output["text"]))
+
+ # Compute accuracy
+ acc = np.mean(np.array(preds) == np.array(labels))
+ invalid = np.mean(np.array(preds) == INVALID)
+
+ # Compute speed
+ num_output_tokens = sum(
+ output["meta_info"]["completion_tokens"] for output in outputs
+ )
+ output_throughput = num_output_tokens / latency
+
+ # Print results
+ print(f"Accuracy: {acc:.3f}")
+ print(f"Invalid: {invalid:.3f}")
+ print(f"Latency: {latency:.3f} s")
+ print(f"Output throughput: {output_throughput:.3f} token/s")
+
+ return {
+ "accuracy": acc,
+ "latency": latency,
+ "output_throughput": output_throughput,
+ }
+
+
+ if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--model-path", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct"
+ )
+ parser.add_argument("--local-data-path", type=Optional[str], default=None)
+ parser.add_argument("--num-shots", type=int, default=5)
+ parser.add_argument("--num-questions", type=int, default=200)
+ args = parser.parse_args()
+ metrics = run_eval(args)
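The new engine-based benchmark exposes run_eval(args), which builds the GSM8K prompts, fans requests out through Engine.async_generate, and returns accuracy, latency, and output throughput. A small driver sketch follows; the Namespace fields mirror the argparse options in the file above, while the concrete values and the direct-call pattern are assumptions, and running it needs a GPU plus the model weights:

# Sketch: driving the new benchmark programmatically instead of via the CLI.
import argparse
from sglang.test.few_shot_gsm8k_engine import run_eval

args = argparse.Namespace(
    model_path="meta-llama/Meta-Llama-3.1-8B-Instruct",
    local_data_path=None,     # None -> download the GSM8K test split
    num_shots=5,
    num_questions=20,
)
metrics = run_eval(args)      # {"accuracy": ..., "latency": ..., "output_throughput": ...}
print(metrics)
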
sglang/test/srt/sampling/penaltylib/utils.py CHANGED
@@ -164,19 +164,20 @@ class BaseBatchedPenalizerTest(unittest.TestCase):
  msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
  )

- actual = orchestrator.apply(
- torch.ones(
- size=(len(case.test_subjects), self.vocab_size),
- dtype=torch.float32,
- device=self.device,
- )
+ original = torch.ones(
+ size=(len(case.test_subjects), self.vocab_size),
+ dtype=torch.float32,
+ device=self.device,
  )
+ actual = orchestrator.apply(original.clone())

  expected = torch.cat(
  tensors=[
  subject.steps[0].expected_logits
  for subject in case.test_subjects
  ],
  )
+ if actual is None:
+ actual = original
  torch.testing.assert_close(
  actual=actual,
  expected=expected,
@@ -226,6 +227,8 @@ class BaseBatchedPenalizerTest(unittest.TestCase):
  device=self.device,
  )
  )
+ if actual_logits is None:
+ continue
  filtered_expected_logits = torch.cat(
  tensors=[
  subject.steps[0].expected_logits
@@ -317,19 +320,20 @@ class BaseBatchedPenalizerTest(unittest.TestCase):
  msg=f"key={key}\nactual={getattr(penalizer, key)}\nexpected={tensor}",
  )

- actual_logits = orchestrator.apply(
- torch.ones(
- size=(len(filtered_subjects), self.vocab_size),
- dtype=torch.float32,
- device=self.device,
- )
+ original = torch.ones(
+ size=(len(filtered_subjects), self.vocab_size),
+ dtype=torch.float32,
+ device=self.device,
  )
+ actual_logits = orchestrator.apply(original.clone())
  filtered_expected_logits = torch.cat(
  tensors=[
  subject.steps[i].expected_logits
  for subject in filtered_subjects
  ],
  )
+ if actual_logits is None:
+ actual_logits = original
  torch.testing.assert_close(
  actual=actual_logits,
  expected=filtered_expected_logits,
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.3.3"
+ __version__ = "0.3.4"