sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_latency.py +1 -553
- sglang/bench_offline_throughput.py +48 -20
- sglang/bench_one_batch.py +472 -0
- sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
- sglang/bench_serving.py +125 -6
- sglang/check_env.py +3 -6
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/srt/configs/model_config.py +13 -14
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +28 -17
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +47 -58
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +16 -13
- sglang/srt/layers/attention/flashinfer_backend.py +106 -54
- sglang/srt/layers/attention/triton_backend.py +9 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +25 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +17 -15
- sglang/srt/layers/logits_processor.py +23 -25
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/layers/sampler.py +4 -8
- sglang/srt/layers/torchao_utils.py +2 -0
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +98 -27
- sglang/srt/managers/detokenizer_manager.py +13 -15
- sglang/srt/managers/io_struct.py +63 -21
- sglang/srt/managers/schedule_batch.py +154 -59
- sglang/srt/managers/schedule_policy.py +18 -16
- sglang/srt/managers/scheduler.py +278 -109
- sglang/srt/managers/session_controller.py +61 -0
- sglang/srt/managers/tokenizer_manager.py +63 -18
- sglang/srt/managers/tp_worker.py +25 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +63 -25
- sglang/srt/model_executor/forward_batch_info.py +128 -32
- sglang/srt/model_executor/model_runner.py +132 -64
- sglang/srt/model_parallel.py +98 -0
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +162 -59
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +31 -25
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +14 -16
- sglang/srt/models/llavavid.py +14 -16
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +22 -20
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/phi3_small.py +447 -0
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/qwen2_vl.py +13 -6
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +107 -93
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +19 -17
- sglang/srt/openai_api/protocol.py +14 -16
- sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- sglang/srt/sampling/sampling_batch_info.py +61 -57
- sglang/srt/sampling/sampling_params.py +14 -16
- sglang/srt/server.py +86 -35
- sglang/srt/server_args.py +96 -80
- sglang/srt/utils.py +266 -68
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +38 -20
- sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- sglang/test/test_utils.py +31 -20
- sglang/version.py +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
- sglang-0.3.6.post1.dist-info/RECORD +164 -0
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.5.post2.dist-info/RECORD +0 -156
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -1,22 +1,21 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Common utilities."""
 
 import base64
 import ipaddress
+import itertools
 import json
 import logging
 import os
@@ -33,7 +32,7 @@ import time
 import warnings
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
 
 import numpy as np
 import psutil
@@ -46,6 +45,8 @@ from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from starlette.routing import Mount
 from torch import nn
+from torch.func import functional_call
+from torch.library import Library
 from torch.profiler import ProfilerActivity, profile, record_function
 from triton.runtime.cache import (
     FileCacheManager,
@@ -71,6 +72,8 @@ def is_flashinfer_available():
     Check whether flashinfer is available.
     As of Oct. 6, 2024, it is only available on NVIDIA GPUs.
     """
+    if os.environ.get("SGLANG_IS_FLASHINFER_AVAILABLE", "true") == "false":
+        return False
     return torch.cuda.is_available() and not is_hip()
 
 
@@ -190,6 +193,94 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
     return free_gpu_memory / (1 << 30)
 
 
+def is_pin_memory_available() -> bool:
+    return torch.cuda.is_available()
+
+
+_CPU_OFFLOAD_BYTES = 0
+_CPU_OFFLOAD_MAX_BYTES = 0
+
+
+def set_cpu_offload_max_bytes(max_bytes: int) -> None:
+    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
+    _CPU_OFFLOAD_BYTES = 0
+    _CPU_OFFLOAD_MAX_BYTES = max_bytes
+
+
+def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
+    device = next(module.parameters()).device
+
+    if device == torch.device("cpu"):
+        return module
+
+    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
+    if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
+        return module
+
+    pin_memory = is_pin_memory_available()
+    # offload parameters to CPU
+    # use pin_memory if possible, which helps cudagraph capture speed
+    offloaded_parameters = False
+    for p in module.parameters():
+        if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
+            # we use per-parameter offloading
+            # one module might have some parameters offloaded and some not
+            break
+
+        # `torch.empty_like` does not support `pin_memory` argument
+        cpu_data = torch.empty_strided(
+            size=p.data.size(),
+            stride=p.data.stride(),
+            dtype=p.data.dtype,
+            layout=p.data.layout,
+            device="cpu",
+            pin_memory=pin_memory,
+        )
+        cpu_data.copy_(p.data)
+        p.data = cpu_data
+        _CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size()
+        offloaded_parameters = True
+
+    if offloaded_parameters:
+        original_forward = module.forward
+
+        def forward(*args, **kwargs):
+            module.forward = original_forward
+            device_state = {
+                # here we blindly call `to(device)`
+                # if the parameter is already on the device, it will be a no-op
+                k: v.to(device, non_blocking=True)
+                for k, v in module.state_dict().items()
+            }
+            output = functional_call(module, device_state, args=args, kwargs=kwargs)
+            module.forward = forward
+            return output
+
+        module.forward = forward
+
+    return module
+
+
+class LayerFn(Protocol):
+
+    def __call__(self, layer_id: int, prefix: str) -> torch.nn.Module: ...
+
+
+def make_layers(
+    num_hidden_layers: int,
+    layer_fn: LayerFn,
+    prefix: str = "",
+) -> Tuple[int, int, torch.nn.ModuleList]:
+    """Make a list of layers with the given layer function"""
+    modules = torch.nn.ModuleList(
+        [
+            maybe_offload_to_cpu(layer_fn(idx=idx, prefix=f"{prefix}.{idx}"))
+            for idx in range(num_hidden_layers)
+        ]
+    )
+    return modules
+
+
 def set_random_seed(seed: int) -> None:
     """Set the random seed for all libraries."""
     random.seed(seed)
@@ -330,6 +421,7 @@ def suppress_other_loggers():
     )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.utils").setLevel(logging.ERROR)
+    logging.getLogger("vllm.model_executor.model_loader.loader").setLevel(logging.ERROR)
 
     warnings.filterwarnings(
         "ignore", category=UserWarning, message="The given NumPy array is not writable"
@@ -394,6 +486,27 @@ def kill_child_process(pid=None, include_self=False, skip_pid=None):
         pass
 
 
+def monkey_patch_vllm_model_config():
+    from vllm.config import ModelConfig
+
+    if not hasattr(ModelConfig, "_resolve_task"):
+        return
+
+    def _resolve_task(
+        self,
+        task_option,
+        hf_config,
+    ):
+        supported_tasks = {
+            "generate": True,
+            "embedding": False,
+        }
+        selected_task = "generate"
+        return supported_tasks, selected_task
+
+    setattr(ModelConfig, "_resolve_task", _resolve_task)
+
+
 def monkey_patch_vllm_p2p_access_check(gpu_id: int):
     """
     Monkey patch the slow p2p access check in vllm.
@@ -405,57 +518,6 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int):
     setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
 
 
-def monkey_patch_vllm_dummy_weight_loader():
-    """
-    Monkey patch the dummy weight loader in vllm to call process_weights_after_loading.
-    """
-
-    from vllm.model_executor.model_loader.loader import (
-        CacheConfig,
-        DeviceConfig,
-        DummyModelLoader,
-        LoRAConfig,
-        ModelConfig,
-        ParallelConfig,
-        SchedulerConfig,
-        _initialize_model,
-        initialize_dummy_weights,
-        nn,
-        set_default_torch_dtype,
-    )
-
-    def load_model(
-        self,
-        *,
-        model_config: ModelConfig,
-        device_config: DeviceConfig,
-        lora_config: Optional[LoRAConfig],
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        cache_config: CacheConfig,
-    ) -> nn.Module:
-        with set_default_torch_dtype(model_config.dtype):
-            with torch.device(device_config.device):
-                model = _initialize_model(
-                    model_config,
-                    self.load_config,
-                    lora_config,
-                    cache_config,
-                )
-
-            for _, module in model.named_modules():
-                quant_method = getattr(module, "quant_method", None)
-                if quant_method is not None:
-                    quant_method.process_weights_after_loading(module)
-
-            # NOTE(woosuk): For accurate performance evaluation, we assign
-            # random values to the weights.
-            initialize_dummy_weights(model)
-        return model.eval()
-
-    setattr(DummyModelLoader, "load_model", load_model)
-
-
 vllm_all_gather_backup = None
 
 
@@ -794,7 +856,48 @@ def add_prometheus_middleware(app):
     app.routes.append(metrics_route)
 
 
-def get_gpu_memory_capacity():
+def bind_port(port):
+    """Bind to a specific port, assuming it's available."""
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)  # Allows address reuse
+    sock.bind(("", port))
+    sock.listen(1)
+    return sock
+
+
+def get_amdgpu_memory_capacity():
+    try:
+        # Run rocm-smi and capture the output
+        result = subprocess.run(
+            ["rocm-smi --showmeminfo vram | grep 'Total Memory' | awk '{print $NF}'"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            shell=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(f"rocm-smi error: {result.stderr.strip()}")
+
+        # Parse the output to extract memory values in MiB
+        memory_values = [
+            float(mem) / 1024 / 1024
+            for mem in result.stdout.strip().split("\n")
+            if re.match(r"^\d+(\.\d+)?$", mem.strip())
+        ]
+
+        if not memory_values:
+            raise ValueError("No GPU memory values found.")
+
+        # Return the minimum memory value
+        return min(memory_values)
+
+    except FileNotFoundError:
+        raise RuntimeError(
+            "rocm-smi not found. Ensure AMD ROCm drivers are installed and accessible."
+        )
+
+
+def get_nvgpu_memory_capacity():
     try:
         # Run nvidia-smi and capture the output
         result = subprocess.run(
@@ -824,3 +927,98 @@ def get_gpu_memory_capacity():
         raise RuntimeError(
            "nvidia-smi not found. Ensure NVIDIA drivers are installed and accessible."
        )
+
+
+def crash_on_warnings():
+    # Crash on warning if we are running CI tests
+    return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true"
+
+
+def get_device_name(device_id: int = 0) -> str:
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        return torch.cuda.get_device_name(device_id)
+
+    if hasattr(torch, "hip") and torch.hip.is_available():
+        return torch.hip.get_device_name(device_id)
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        return torch.xpu.get_device_name(device_id)
+
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        return torch.hpu.get_device_name(device_id)
+
+
+sglang_lib = Library("sglang", "FRAGMENT")  # noqa
+
+
+def direct_register_custom_op(
+    op_name: str,
+    op_func: Callable,
+    mutates_args: List[str],
+    fake_impl: Optional[Callable] = None,
+    target_lib: Optional[Library] = None,
+):
+    """
+    `torch.library.custom_op` can have significant overhead because it
+    needs to consider complicated dispatching logic. This function
+    directly registers a custom op and dispatches it to the CUDA backend.
+    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
+    for more details.
+
+    By default, the custom op is registered to the vLLM library. If you
+    want to register it to a different library, you can pass the library
+    object to the `target_lib` argument.
+
+    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
+    library object. If you want to bind the operator to a different library,
+    make sure the library object is alive when the operator is used.
+    """
+    import torch.library
+
+    if hasattr(torch.library, "infer_schema"):
+        schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
+    else:
+        # for pytorch 2.4
+        import torch._custom_op.impl
+
+        schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
+
+    my_lib = target_lib or sglang_lib
+    my_lib.define(op_name + schema_str)
+    my_lib.impl(op_name, op_func, "CUDA")
+    if fake_impl is not None:
+        my_lib._register_fake(op_name, fake_impl)
+
+
+def gpu_proc_affinity(
+    tp_size: int,
+    nnodes: int,
+    gpu_id: int,
+):
+    # current process
+    pid = os.getpid()
+    p = psutil.Process(pid)
+
+    tp_size_per_node = tp_size // nnodes
+
+    # total physical cores
+    total_pcores = psutil.cpu_count(logical=False)
+    # physical cores per TP (N.B. more Cores than GPUs on node)
+    num_cores_bind = total_pcores // tp_size_per_node
+
+    # able to handle multiple DP per node
+    start_cpu_id = (gpu_id * num_cores_bind) % total_pcores
+    end_cpu_id = start_cpu_id + num_cores_bind
+
+    if psutil.cpu_count() != psutil.cpu_count(logical=False):
+        # HT on
+        upper_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
+        lower_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)]
+        bind_cpu_ids = list(itertools.chain(upper_cpu_ids, lower_cpu_ids))
+    else:
+        # HT off
+        bind_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
+
+    # set cpu_affinity to current process
+    p.cpu_affinity(bind_cpu_ids)
+    logger.info(f"Process {pid} gpu_id {gpu_id} is running on CPUs: {p.cpu_affinity()}")
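Of the additions above, the CPU-offload helpers are the most self-contained: set_cpu_offload_max_bytes sets a global byte budget and maybe_offload_to_cpu parks a module's parameters in (pinned) host memory until that budget is exhausted, re-materializing them on the original device inside a patched forward. The snippet below is an illustrative sketch, not code from the package; it assumes a CUDA device and uses an arbitrary 4 GiB budget. In the wheel itself the wrapping is driven through the new make_layers helper rather than by user code.

import torch

from sglang.srt.utils import maybe_offload_to_cpu, set_cpu_offload_max_bytes

# Arbitrary example budget: allow up to 4 GiB of parameters in pinned host RAM.
set_cpu_offload_max_bytes(4 * 1024**3)

# The wrapped module keeps CPU copies of its weights; its forward() temporarily
# copies them back to the original device via torch.func.functional_call.
layer = maybe_offload_to_cpu(torch.nn.Linear(4096, 4096).cuda())
out = layer(torch.randn(2, 4096, device="cuda"))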
sglang/test/few_shot_gsm8k.py
CHANGED
@@ -48,9 +48,13 @@ def run_eval(args):
     # Select backend
     set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
 
-    # Read data
-    url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
-    filename = download_and_cache_file(url)
+    if args.data_path is None:
+        # Read data
+        url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+        filename = download_and_cache_file(url)
+    else:
+        filename = args.data_path
+
     lines = list(read_jsonl(filename))
 
     # Construct prompts
@@ -131,7 +135,7 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shots", type=int, default=5)
-    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--data-path", type=str)
     parser.add_argument("--num-questions", type=int, default=200)
     parser.add_argument("--max-new-tokens", type=int, default=512)
     parser.add_argument("--parallel", type=int, default=128)
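The change above makes --data-path optional: when it is omitted, run_eval downloads and caches the upstream GSM8K test split instead of requiring a local test.jsonl. Below is a hedged sketch of driving it programmatically; the field names are taken from the parser above, a running SGLang endpoint is assumed, and the full script may expect additional fields.

import argparse

from sglang.test.few_shot_gsm8k import run_eval

args = argparse.Namespace(
    host="http://127.0.0.1",
    port=30000,
    num_shots=5,
    num_questions=20,
    max_new_tokens=512,
    parallel=16,
    data_path=None,  # None -> download and cache the upstream test.jsonl
)
# run_eval(args)  # uncomment with a live sglang server at host:port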
sglang/test/runners.py
CHANGED
@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 import json
 import multiprocessing as mp
@@ -58,6 +57,28 @@ def get_top_logprobs(logits, k):
     return logprobs
 
 
+def _get_sentence_transformer_embedding_model(model_path, torch_dtype):
+    from sentence_transformers import SentenceTransformer
+    from sentence_transformers.util import is_sentence_transformer_model
+
+    if is_sentence_transformer_model(model_path):
+        model = SentenceTransformer(
+            model_path,
+            model_kwargs={"torch_dtype": torch_dtype},
+        )
+    else:  # if no pre-trained sentence-transformers model
+        from sentence_transformers import models
+
+        word_embedding_model = models.Transformer(model_path).to(dtype=torch_dtype)
+        pooling_model = models.Pooling(
+            word_embedding_model.get_word_embedding_dimension(),
+            pooling_mode="lasttoken",
+        )
+        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+
+    return model.cuda()
+
+
 @dataclass
 class ModelOutput:
     output_strs: List[str] = None
@@ -114,12 +135,9 @@ class HFRunner:
                 low_cpu_mem_usage=True,
             ).cuda()
         elif self.model_type == "embedding":
-            from sentence_transformers import SentenceTransformer
-
-            self.model = SentenceTransformer(
-                model_path,
-                model_kwargs={"torch_dtype": torch_dtype},
-            ).cuda()
+            self.model = _get_sentence_transformer_embedding_model(
+                model_path, torch_dtype
+            )
         elif self.model_type == "reward":
             from transformers import AutoModelForSequenceClassification
 

sglang/test/srt/sampling/penaltylib/utils.py
CHANGED
@@ -1,7 +1,7 @@
 import dataclasses
 import enum
-import typing
 import unittest
+from typing import Dict, List, Optional, Set, Tuple, Type
 
 import torch
 
@@ -16,7 +16,7 @@ from sglang.srt.sampling.penaltylib.orchestrator import (
 class MockSamplingParams:
     frequency_penalty: float = 0.0
     min_new_tokens: int = 0
-    stop_token_ids: typing.List[int] = None
+    stop_token_ids: List[int] = None
     presence_penalty: float = 0.0
     repetition_penalty: float = 1.0
 
@@ -24,12 +24,12 @@ class MockSamplingParams:
 @dataclasses.dataclass
 class MockTokenizer:
     eos_token_id: int
-    additional_stop_token_ids: typing.Optional[typing.List[int]] = None
+    additional_stop_token_ids: Optional[List[int]] = None
 
 
 @dataclasses.dataclass
 class MockReq:
-    origin_input_ids: typing.List[int]
+    origin_input_ids: List[int]
     sampling_params: MockSamplingParams
     tokenizer: MockTokenizer
 
@@ -42,8 +42,8 @@ class StepType(enum.Enum):
 @dataclasses.dataclass
 class Step:
     type: StepType
-    token_ids: typing.List[int]
-    expected_tensors: typing.Dict[str, torch.Tensor]
+    token_ids: List[int]
+    expected_tensors: Dict[str, torch.Tensor]
     # assume initial logits are all 1
     expected_logits: torch.Tensor
 
@@ -52,7 +52,7 @@ class Step:
 class Subject:
     sampling_params: MockSamplingParams
     # first step must be input, which will be converted to Req
-    steps: typing.List[Step]
+    steps: List[Step]
    eos_token_id: int = -1
 
     def __post_init__(self):
@@ -66,7 +66,7 @@ class Subject:
                 f"Expected tensors keys must be the same for all steps. Got {self.steps[i].expected_tensors.keys()} for key={i} and {self.steps[0].expected_tensors.keys()}"
             )
 
-    def tensor_keys(self, i: int = 0) -> typing.Set[str]:
+    def tensor_keys(self, i: int = 0) -> Set[str]:
         return set(self.steps[i].expected_tensors.keys())
 
     def to_req(self) -> MockReq:
@@ -80,7 +80,7 @@ class Subject:
 @dataclasses.dataclass
 class Case:
     enabled: bool
-    test_subjects: typing.List[Subject]
+    test_subjects: List[Subject]
 
     def __post_init__(self):
         # each test_subjects.steps should have the same expected_tensors.keys()
@@ -90,12 +90,12 @@ class Case:
                 f"Expected tensors keys must be the same for all test_subjects. Got {self.test_subjects[i].tensor_keys()} for key={i} and {self.test_subjects[0].tensor_keys()}"
             )
 
-    def tensor_keys(self, i: int = 0) -> typing.List[str]:
+    def tensor_keys(self, i: int = 0) -> List[str]:
         return set(self.test_subjects[i].tensor_keys())
 
 
 class BaseBatchedPenalizerTest(unittest.TestCase):
-    Penalizer: typing.Type[_BatchedPenalizer]
+    Penalizer: Type[_BatchedPenalizer]
     device = "cuda"
     vocab_size = 5
 
@@ -115,7 +115,7 @@ class BaseBatchedPenalizerTest(unittest.TestCase):
         """
         return torch.tensor(data, **kwargs, device=self.device)
 
-    def create_test_subjects(self) -> typing.List[Subject]:
+    def create_test_subjects(self) -> List[Subject]:
         raise NotImplementedError()
 
     def create_test_cases(self):
@@ -127,7 +127,7 @@ class BaseBatchedPenalizerTest(unittest.TestCase):
 
     def _create_penalizer(
         self, case: Case
-    ) -> typing.Tuple[BatchedPenalizerOrchestrator, _BatchedPenalizer]:
+    ) -> Tuple[BatchedPenalizerOrchestrator, _BatchedPenalizer]:
         orchestrator = BatchedPenalizerOrchestrator(
             vocab_size=self.vocab_size,
             batch=_BatchLike(reqs=[subject.to_req() for subject in case.test_subjects]),
@@ -287,22 +287,24 @@ class BaseBatchedPenalizerTest(unittest.TestCase):
                 if i < len(subject.steps)
             ]
 
-            inputs: typing.List[typing.List[int]] = []
-            outputs: typing.List[typing.List[int]] = []
+            inputs: List[List[int]] = []
+            outputs: List[List[int]] = []
             for subject in filtered_subjects:
                 step = subject.steps[i]
                 if step.type == StepType.INPUT:
-                    inputs.append(step.token_ids)
-                    outputs.append([])
+                    raise NotImplementedError()
                 else:
                     inputs.append([])
                     outputs.append(step.token_ids)
 
-            if any(inputs):
-                orchestrator.cumulate_input_tokens(inputs)
-
             if any(outputs):
-                orchestrator.cumulate_output_tokens(outputs)
+                for j in range(max(len(x) for x in outputs)):
+                    tmp_outputs = torch.tensor(
+                        [x[j] for x in outputs],
+                        dtype=torch.int32,
+                        device=orchestrator.device,
+                    )
+                    orchestrator.cumulate_output_tokens(tmp_outputs)
 
             if penalizer.is_required():
                 self.assertTrue(penalizer.is_prepared())
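The last hunk tracks an API change in BatchedPenalizerOrchestrator: the test no longer cumulates input tokens at all, and cumulate_output_tokens is now fed one tensor per decode step holding the newest token id of every request in the batch, instead of per-request Python lists. A hedged sketch of that calling convention follows, with invented token ids; the actual orchestrator call is left commented out (it would be the object built by _create_penalizer above).

import torch

# Batch of 3 requests observed over 2 decode steps; ids are made up.
step_token_ids = [
    [101, 7, 42],  # newest token of each request after step 1
    [102, 8, 43],  # newest token of each request after step 2
]
for ids in step_token_ids:
    batch_ids = torch.tensor(ids, dtype=torch.int32, device="cuda")
    # orchestrator.cumulate_output_tokens(batch_ids)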
|