sglang 0.3.6.post3__py3-none-any.whl → 0.4.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. sglang/__init__.py +1 -1
  2. sglang/bench_one_batch.py +4 -0
  3. sglang/bench_serving.py +13 -0
  4. sglang/check_env.py +1 -1
  5. sglang/srt/_custom_ops.py +118 -0
  6. sglang/srt/configs/device_config.py +17 -0
  7. sglang/srt/configs/load_config.py +84 -0
  8. sglang/srt/configs/model_config.py +161 -4
  9. sglang/srt/configs/qwen2vl.py +5 -8
  10. sglang/srt/constrained/outlines_backend.py +11 -1
  11. sglang/srt/constrained/outlines_jump_forward.py +8 -1
  12. sglang/srt/constrained/xgrammar_backend.py +5 -5
  13. sglang/srt/distributed/__init__.py +3 -0
  14. sglang/srt/distributed/communication_op.py +34 -0
  15. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  16. sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
  18. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
  19. sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
  20. sglang/srt/distributed/device_communicators/pynccl.py +204 -0
  21. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
  22. sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
  23. sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
  24. sglang/srt/distributed/parallel_state.py +1275 -0
  25. sglang/srt/distributed/utils.py +223 -0
  26. sglang/srt/hf_transformers_utils.py +37 -1
  27. sglang/srt/layers/attention/__init__.py +5 -2
  28. sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
  29. sglang/srt/layers/attention/flashinfer_backend.py +33 -20
  30. sglang/srt/layers/attention/torch_native_backend.py +299 -0
  31. sglang/srt/layers/attention/triton_backend.py +22 -8
  32. sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
  33. sglang/srt/layers/ep_moe/__init__.py +0 -0
  34. sglang/srt/layers/ep_moe/kernels.py +349 -0
  35. sglang/srt/layers/ep_moe/layer.py +661 -0
  36. sglang/srt/layers/fused_moe_patch.py +20 -11
  37. sglang/srt/layers/linear.py +1 -0
  38. sglang/srt/layers/logits_processor.py +17 -3
  39. sglang/srt/layers/quantization/__init__.py +36 -2
  40. sglang/srt/layers/quantization/fp8.py +559 -0
  41. sglang/srt/layers/quantization/fp8_utils.py +27 -0
  42. sglang/srt/layers/radix_attention.py +4 -2
  43. sglang/srt/layers/sampler.py +2 -0
  44. sglang/srt/layers/torchao_utils.py +23 -45
  45. sglang/srt/layers/vocab_parallel_embedding.py +1 -0
  46. sglang/srt/lora/lora.py +1 -1
  47. sglang/srt/managers/io_struct.py +48 -2
  48. sglang/srt/managers/schedule_batch.py +19 -14
  49. sglang/srt/managers/schedule_policy.py +7 -4
  50. sglang/srt/managers/scheduler.py +145 -85
  51. sglang/srt/managers/tokenizer_manager.py +166 -68
  52. sglang/srt/managers/tp_worker.py +36 -3
  53. sglang/srt/managers/tp_worker_overlap_thread.py +28 -8
  54. sglang/srt/mem_cache/memory_pool.py +5 -1
  55. sglang/srt/model_executor/cuda_graph_runner.py +30 -7
  56. sglang/srt/model_executor/forward_batch_info.py +9 -4
  57. sglang/srt/model_executor/model_runner.py +146 -153
  58. sglang/srt/model_loader/__init__.py +34 -0
  59. sglang/srt/model_loader/loader.py +1139 -0
  60. sglang/srt/model_loader/utils.py +41 -0
  61. sglang/srt/model_loader/weight_utils.py +640 -0
  62. sglang/srt/model_parallel.py +1 -5
  63. sglang/srt/models/baichuan.py +9 -10
  64. sglang/srt/models/chatglm.py +6 -15
  65. sglang/srt/models/commandr.py +4 -5
  66. sglang/srt/models/dbrx.py +2 -3
  67. sglang/srt/models/deepseek.py +4 -11
  68. sglang/srt/models/deepseek_v2.py +90 -18
  69. sglang/srt/models/exaone.py +2 -3
  70. sglang/srt/models/gemma.py +2 -6
  71. sglang/srt/models/gemma2.py +3 -14
  72. sglang/srt/models/gemma2_reward.py +0 -1
  73. sglang/srt/models/gpt2.py +5 -12
  74. sglang/srt/models/gpt_bigcode.py +6 -22
  75. sglang/srt/models/grok.py +3 -8
  76. sglang/srt/models/internlm2.py +2 -3
  77. sglang/srt/models/internlm2_reward.py +0 -1
  78. sglang/srt/models/llama.py +96 -31
  79. sglang/srt/models/llama_classification.py +1 -2
  80. sglang/srt/models/llama_embedding.py +1 -2
  81. sglang/srt/models/llama_reward.py +2 -3
  82. sglang/srt/models/llava.py +1 -4
  83. sglang/srt/models/llavavid.py +1 -2
  84. sglang/srt/models/minicpm.py +4 -7
  85. sglang/srt/models/minicpm3.py +6 -19
  86. sglang/srt/models/mixtral.py +24 -14
  87. sglang/srt/models/mixtral_quant.py +2 -3
  88. sglang/srt/models/mllama.py +3 -7
  89. sglang/srt/models/olmo.py +2 -8
  90. sglang/srt/models/olmo2.py +0 -1
  91. sglang/srt/models/olmoe.py +3 -5
  92. sglang/srt/models/phi3_small.py +8 -13
  93. sglang/srt/models/qwen.py +2 -3
  94. sglang/srt/models/qwen2.py +10 -9
  95. sglang/srt/models/qwen2_moe.py +4 -16
  96. sglang/srt/models/qwen2_vl.py +2 -6
  97. sglang/srt/models/registry.py +99 -0
  98. sglang/srt/models/stablelm.py +2 -3
  99. sglang/srt/models/torch_native_llama.py +6 -17
  100. sglang/srt/models/xverse.py +2 -4
  101. sglang/srt/models/xverse_moe.py +4 -11
  102. sglang/srt/models/yivl.py +2 -3
  103. sglang/srt/openai_api/adapter.py +9 -5
  104. sglang/srt/openai_api/protocol.py +1 -0
  105. sglang/srt/sampling/sampling_batch_info.py +9 -8
  106. sglang/srt/server.py +270 -173
  107. sglang/srt/server_args.py +102 -29
  108. sglang/srt/utils.py +295 -28
  109. sglang/test/test_utils.py +7 -0
  110. sglang/version.py +1 -1
  111. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/METADATA +5 -4
  112. sglang-0.4.0.post1.dist-info/RECORD +189 -0
  113. sglang-0.3.6.post3.dist-info/RECORD +0 -162
  114. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/LICENSE +0 -0
  115. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/WHEEL +0 -0
  116. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -66,7 +66,7 @@ from sglang.version import __version__
 
 __all__ += ["__version__"]
 
-# SGL Backends
+# SGLang Backends
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import LazyImport
 
sglang/bench_one_batch.py CHANGED
@@ -111,8 +111,12 @@ def load_model(server_args, port_args, tp_rank):
     model_config = ModelConfig(
         server_args.model_path,
         trust_remote_code=server_args.trust_remote_code,
+        revision=server_args.revision,
         context_length=server_args.context_length,
         model_override_args=server_args.json_model_override_args,
+        is_embedding=server_args.is_embedding,
+        dtype=server_args.dtype,
+        quantization=server_args.quantization,
     )
     model_runner = ModelRunner(
         model_config=model_config,
sglang/bench_serving.py CHANGED
@@ -51,6 +51,7 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
+    lora_name: str
    extra_request_body: Dict[str, Any]
 
 
@@ -319,6 +320,7 @@ async def async_request_sglang_generate(
                 "ignore_eos": not args.disable_ignore_eos,
             },
             "stream": not args.disable_stream,
+            "lora_path": request_func_input.lora_name,
             **request_func_input.extra_request_body,
         }
         headers = {}
@@ -884,6 +886,7 @@ async def benchmark(
     request_rate: float,
     max_concurrency: Optional[int],
     disable_tqdm: bool,
+    lora_name: str,
     extra_request_body: Dict[str, Any],
     profile: bool,
 ):
@@ -909,6 +912,7 @@ async def benchmark(
         api_url=api_url,
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
     test_output = await request_func(request_func_input=test_input)
@@ -942,6 +946,7 @@ async def benchmark(
             api_url=api_url,
             prompt_len=prompt_len,
             output_len=output_len,
+            lora_name=lora_name,
            extra_request_body=extra_request_body,
         )
         tasks.append(
@@ -1247,6 +1252,7 @@ def run_benchmark(args_: argparse.Namespace):
                request_rate=args.request_rate,
                max_concurrency=args.max_concurrency,
                disable_tqdm=args.disable_tqdm,
+               lora_name=args.lora_name,
                extra_request_body=extra_request_body,
                profile=args.profile,
            )
@@ -1267,6 +1273,7 @@ def run_benchmark(args_: argparse.Namespace):
                request_rate=rate,
                max_concurrency=args.max_concurrency,
                disable_tqdm=args.disable_tqdm,
+               lora_name=args.lora_name,
                extra_request_body=extra_request_body,
                profile=args.profile,
            )
@@ -1451,5 +1458,11 @@ if __name__ == "__main__":
         help="Use Torch Profiler. The endpoint must be launched with "
         "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
     )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        default=None,
+        help="The name of LoRA adapter",
+    )
     args = parser.parse_args()
     run_benchmark(args)
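
The new --lora-name flag threads a single adapter name from the CLI through RequestFuncInput into the request body as "lora_path". A minimal sketch of the resulting payload shape; the build_generate_payload helper and the prompt/adapter values are illustrative, not part of bench_serving.py:

import json
from typing import Optional

def build_generate_payload(prompt: str, max_new_tokens: int, lora_name: Optional[str]) -> dict:
    # Mirrors the /generate payload built in async_request_sglang_generate above.
    return {
        "text": prompt,
        "sampling_params": {
            "temperature": 0.0,
            "max_new_tokens": max_new_tokens,
            "ignore_eos": True,
        },
        "stream": True,
        "lora_path": lora_name,  # new in 0.4.0: selects the LoRA adapter on the server
    }

print(json.dumps(build_generate_payload("Hello", 32, "my-adapter"), indent=2))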
sglang/check_env.py CHANGED
@@ -9,7 +9,7 @@ from collections import OrderedDict, defaultdict
 
 import torch
 
-# List of packages to check versions for
+# List of packages to check versions
 PACKAGE_LIST = [
     "sglang",
     "flashinfer",
sglang/srt/_custom_ops.py ADDED
@@ -0,0 +1,118 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/_custom_ops.py
+import contextlib
+import functools
+import importlib
+import logging
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+import torch
+import torch.library
+
+from sglang.srt.utils import is_hpu
+
+logger = logging.getLogger(__name__)
+
+if not is_hpu():
+    try:
+        import custom_ar
+    except ImportError as e:
+        logger.warning("Failed to import from custom_ar with %r", e)
+
+
+def hint_on_error(fn):
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        try:
+            return fn(*args, **kwargs)
+
+        except NotImplementedError as e:
+            msg = (
+                "Error in calling custom op %s: %s\n"
+                "Not implemented or built, mostly likely because the current current device "
+                "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
+                "incorrectly while building)"
+            )
+            logger.error(msg, fn.__name__, e)
+            raise NotImplementedError(msg % (fn.__name__, e)) from e
+        except AttributeError as e:
+            msg = (
+                "Error in calling custom op %s: %s\n"
+                "Possibly you have built or installed an obsolete version of vllm.\n"
+                "Please try a clean build and install of vllm,"
+                "or remove old built files such as vllm/*cpython*.so and build/ ."
+            )
+            logger.error(msg, fn.__name__, e)
+            raise e
+
+    return wrapper
+
+
+# custom ar
+def init_custom_ar(
+    ipc_tensors: List[torch.Tensor],
+    rank_data: torch.Tensor,
+    rank: int,
+    full_nvlink: bool,
+) -> int:
+    return torch.ops._C_vllm_ar.init_custom_ar(
+        ipc_tensors, rank_data, rank, full_nvlink
+    )
+
+
+def all_reduce(
+    fa: int,
+    inp: torch.Tensor,
+    out: torch.Tensor,
+    reg_buffer: int,
+    reg_buffer_sz_bytes: int,
+) -> None:
+    torch.ops._C_vllm_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
+
+
+def dispose(fa: int) -> None:
+    torch.ops._C_vllm_ar.dispose(fa)
+
+
+def meta_size() -> int:
+    return torch.ops._C_vllm_ar.meta_size()
+
+
+def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
+    return torch.ops._C_vllm_ar.register_buffer(fa, ipc_tensors)
+
+
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+    return torch.ops._C_vllm_ar.get_graph_buffer_ipc_meta(fa)
+
+
+def register_graph_buffers(
+    fa: int, handles: List[List[int]], offsets: List[List[int]]
+) -> None:
+    torch.ops._C_vllm_ar.register_graph_buffers(fa, handles, offsets)
+
+
+# temporary fix for https://github.com/vllm-project/vllm/issues/5456
+# TODO: remove this in v0.6.0
+names_and_values = globals()
+names_and_values_to_update = {}
+# prepare variables to avoid dict size change during iteration
+k, v, arg = None, None, None
+fn_type = type(lambda x: x)
+for k, v in names_and_values.items():
+    # find functions that are defined in this file and have torch.Tensor
+    # in their annotations. `arg == "torch.Tensor"` is used to handle
+    # the case when users use `import __annotations__` to turn type
+    # hints into strings.
+    if (
+        isinstance(v, fn_type)
+        and v.__code__.co_filename == __file__
+        and any(
+            arg is torch.Tensor or arg == "torch.Tensor"
+            for arg in v.__annotations__.values()
+        )
+    ):
+        names_and_values_to_update[k] = hint_on_error(v)
+
+names_and_values.update(names_and_values_to_update)
+del names_and_values_to_update, names_and_values, v, k, fn_type
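
The hint_on_error wrapper above is applied at import time to every function in the module whose annotations mention torch.Tensor, so a missing or mismatched custom-op build surfaces as a descriptive error rather than a bare NotImplementedError. A self-contained sketch of the same decorator pattern; fake_op and the message text are illustrative, not part of the package:

import functools

def hint_on_error(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except NotImplementedError as e:
            # Re-raise with a hint about the likely cause (unsupported device or bad build).
            raise NotImplementedError(
                f"Error in calling custom op {fn.__name__}: {e}. "
                "The kernel may not be built for this device."
            ) from e
    return wrapper

@hint_on_error
def fake_op(x: "torch.Tensor") -> "torch.Tensor":  # stand-in for a real custom op
    raise NotImplementedError("kernel not compiled")

try:
    fake_op(None)
except NotImplementedError as e:
    print(e)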
sglang/srt/configs/device_config.py ADDED
@@ -0,0 +1,17 @@
+import logging
+from typing import Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+class DeviceConfig:
+    device: Optional[torch.device]
+
+    def __init__(self, device: str = "cuda") -> None:
+        if device in ["cuda", "xpu", "hpu"]:
+            self.device_type = device
+        else:
+            raise RuntimeError(f"Not supported device type: {device}")
+        self.device = torch.device(self.device_type)
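
A quick usage sketch for the new DeviceConfig, assuming sglang 0.4.0.post1 is installed; "cpu" is passed only to demonstrate the rejection path:

from sglang.srt.configs.device_config import DeviceConfig

cfg = DeviceConfig("cuda")
print(cfg.device_type, cfg.device)  # "cuda" and the corresponding torch.device

try:
    DeviceConfig("cpu")  # not in ["cuda", "xpu", "hpu"], so this raises
except RuntimeError as e:
    print(e)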
sglang/srt/configs/load_config.py ADDED
@@ -0,0 +1,84 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+import enum
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+
+from sglang.srt.utils import is_hip
+
+logger = logging.getLogger(__name__)
+
+
+class LoadFormat(str, enum.Enum):
+    AUTO = "auto"
+    PT = "pt"
+    SAFETENSORS = "safetensors"
+    NPCACHE = "npcache"
+    DUMMY = "dummy"
+    SHARDED_STATE = "sharded_state"
+    GGUF = "gguf"
+    BITSANDBYTES = "bitsandbytes"
+    MISTRAL = "mistral"
+
+
+@dataclass
+class LoadConfig:
+    """
+    download_dir: Directory to download and load the weights, default to the
+        default cache directory of huggingface.
+    load_format: The format of the model weights to load:
+        "auto" will try to load the weights in the safetensors format and
+            fall back to the pytorch bin format if safetensors format is
+            not available.
+        "pt" will load the weights in the pytorch bin format.
+        "safetensors" will load the weights in the safetensors format.
+        "npcache" will load the weights in pytorch format and store
+            a numpy cache to speed up the loading.
+        "dummy" will initialize the weights with random values, which is
+            mainly for profiling.
+        "bitsandbytes" will load nf4 type weights.
+    ignore_patterns: The list of patterns to ignore when loading the model.
+        Default to "original/**/*" to avoid repeated loading of llama's
+        checkpoints.
+
+    """
+
+    load_format: Union[str, LoadFormat] = LoadFormat.AUTO
+    download_dir: Optional[str] = None
+    model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
+    ignore_patterns: Optional[Union[List[str], str]] = None
+
+    def __post_init__(self):
+        model_loader_extra_config = self.model_loader_extra_config or {}
+        if isinstance(model_loader_extra_config, str):
+            self.model_loader_extra_config = json.loads(model_loader_extra_config)
+        self._verify_load_format()
+
+        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+            logger.info(
+                "Ignoring the following patterns when downloading weights: %s",
+                self.ignore_patterns,
+            )
+        else:
+            self.ignore_patterns = ["original/**/*"]
+
+    def _verify_load_format(self) -> None:
+        if not isinstance(self.load_format, str):
+            return
+
+        load_format = self.load_format.lower()
+        self.load_format = LoadFormat(load_format)
+
+        rocm_not_supported_load_format: List[str] = []
+        if is_hip() and load_format in rocm_not_supported_load_format:
+            rocm_supported_load_format = [
+                f
+                for f in LoadFormat.__members__
+                if (f not in rocm_not_supported_load_format)
+            ]
+            raise ValueError(
+                f"load format '{load_format}' is not supported in ROCm. "
+                f"Supported load formats are "
+                f"{rocm_supported_load_format}"
+            )
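
A usage sketch for the new LoadConfig, assuming sglang 0.4.0.post1 is installed; the "some_key" extra-config entry is a placeholder:

from sglang.srt.configs.load_config import LoadConfig, LoadFormat

# A JSON string passed as model_loader_extra_config is parsed into a dict in __post_init__.
cfg = LoadConfig(load_format="safetensors", model_loader_extra_config='{"some_key": 1}')
print(cfg.load_format is LoadFormat.SAFETENSORS)  # True: strings are normalized to the enum
print(cfg.model_loader_extra_config)              # {'some_key': 1}
print(cfg.ignore_patterns)                        # defaults to ['original/**/*']

try:
    LoadConfig(load_format="not-a-format")        # rejected by LoadFormat(...)
except ValueError as e:
    print(e)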
sglang/srt/configs/model_config.py CHANGED
@@ -15,12 +15,14 @@
 import json
 import logging
 from enum import IntEnum, auto
-from typing import List, Optional
+from typing import List, Optional, Union
 
+import torch
 from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
-from sglang.srt.utils import get_bool_env_var
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+from sglang.srt.utils import get_bool_env_var, is_hip
 
 logger = logging.getLogger(__name__)
 
@@ -33,17 +35,22 @@ class AttentionArch(IntEnum):
 class ModelConfig:
     def __init__(
         self,
-        path: str,
+        model_path: str,
         trust_remote_code: bool = True,
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
         model_override_args: Optional[dict] = None,
         is_embedding: Optional[bool] = None,
+        dtype: str = "auto",
+        quantization: Optional[str] = None,
     ) -> None:
+        self.model_path = model_path
+        self.revision = revision
+        self.quantization = quantization
         # Parse args
         self.model_override_args = json.loads(model_override_args)
         self.hf_config = get_config(
-            path,
+            model_path,
             trust_remote_code=trust_remote_code,
             revision=revision,
             model_override_args=self.model_override_args,
@@ -56,6 +63,7 @@ class ModelConfig:
         )
         self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
+        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
 
         # Derive context length
         derived_context_len = get_context_length(self.hf_text_config)
@@ -116,6 +124,8 @@ class ModelConfig:
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
         self.vocab_size = self.hf_text_config.vocab_size
 
+        self._verify_quantization()
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
@@ -174,6 +184,86 @@ class ModelConfig:
         # parallel size so each GPU has at least one KV head.
         return max(1, total_num_kv_heads // tensor_parallel_size)
 
+    # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+    def _parse_quant_hf_config(self):
+        quant_cfg = getattr(self.hf_config, "quantization_config", None)
+        if quant_cfg is None:
+            # compressed-tensors uses a "compression_config" key
+            quant_cfg = getattr(self.hf_config, "compression_config", None)
+        return quant_cfg
+
+    # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+    def _verify_quantization(self) -> None:
+        supported_quantization = [*QUANTIZATION_METHODS]
+        rocm_supported_quantization = [
+            "awq",
+            "gptq",
+            "fp8",
+            "compressed_tensors",
+            "compressed-tensors",
+            "fbgemm_fp8",
+        ]
+        optimized_quantization_methods = [
+            "fp8",
+            "marlin",
+            "modelopt",
+            "gptq_marlin_24",
+            "gptq_marlin",
+            "awq_marlin",
+            "fbgemm_fp8",
+            "compressed_tensors",
+            "compressed-tensors",
+            "experts_int8",
+        ]
+        if self.quantization is not None:
+            self.quantization = self.quantization.lower()
+
+        # Parse quantization method from the HF model config, if available.
+        quant_cfg = self._parse_quant_hf_config()
+
+        if quant_cfg is not None:
+            quant_method = quant_cfg.get("quant_method", "").lower()
+
+            # Detect which checkpoint is it
+            for _, method in QUANTIZATION_METHODS.items():
+                quantization_override = method.override_quantization_method(
+                    quant_cfg, self.quantization
+                )
+                if quantization_override:
+                    quant_method = quantization_override
+                    self.quantization = quantization_override
+                    break
+
+            # Verify quantization configurations.
+            if self.quantization is None:
+                self.quantization = quant_method
+            elif self.quantization != quant_method:
+                raise ValueError(
+                    "Quantization method specified in the model config "
+                    f"({quant_method}) does not match the quantization "
+                    f"method specified in the `quantization` argument "
+                    f"({self.quantization})."
+                )
+
+        if self.quantization is not None:
+            if self.quantization not in supported_quantization:
+                raise ValueError(
+                    f"Unknown quantization method: {self.quantization}. Must "
+                    f"be one of {supported_quantization}."
+                )
+            if is_hip() and self.quantization not in rocm_supported_quantization:
+                raise ValueError(
+                    f"{self.quantization} quantization is currently not "
+                    f"supported in ROCm."
+                )
+            if self.quantization not in optimized_quantization_methods:
+                logger.warning(
+                    "%s quantization is not fully "
+                    "optimized yet. The speed can be slower than "
+                    "non-quantized models.",
+                    self.quantization,
+                )
+
 
 def get_hf_text_config(config: PretrainedConfig):
     """Get the "sub" config relevant to llm for multi modal models.
@@ -183,6 +273,9 @@ def get_hf_text_config(config: PretrainedConfig):
     if class_name.startswith("Llava") and class_name.endswith("ForCausalLM"):
         # We support non-hf version of llava models, so we do not want to
         # read the wrong values from the unused default text_config.
+        # NOTE(HandH1998): We set `torch_dtype` of config to `torch.float16` for the weights, as
+        # `torch.float16` is default used for image features in `python/sglang/srt/models/llava.py`.
+        setattr(config, "torch_dtype", torch.float16)
         return config
 
     if hasattr(config, "text_config"):
@@ -195,6 +288,70 @@ def get_hf_text_config(config: PretrainedConfig):
         return config
 
 
+# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+_STR_DTYPE_TO_TORCH_DTYPE = {
+    "half": torch.float16,
+    "float16": torch.float16,
+    "float": torch.float32,
+    "float32": torch.float32,
+    "bfloat16": torch.bfloat16,
+}
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+def _get_and_verify_dtype(
+    config: PretrainedConfig,
+    dtype: Union[str, torch.dtype],
+) -> torch.dtype:
+    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
+    # because config.torch_dtype can be None.
+    config_dtype = getattr(config, "torch_dtype", None)
+    if config_dtype is None:
+        config_dtype = torch.float32
+
+    if isinstance(dtype, str):
+        dtype = dtype.lower()
+        if dtype == "auto":
+            if config_dtype == torch.float32:
+                if config.model_type == "gemma2":
+                    logger.info(
+                        "For Gemma 2, we downcast float32 to bfloat16 instead "
+                        "of float16 by default. Please specify `dtype` if you "
+                        "want to use float16."
+                    )
+                    torch_dtype = torch.bfloat16
+                else:
+                    # Following the common practice, we use float16 for float32
+                    # models.
+                    torch_dtype = torch.float16
+            else:
+                torch_dtype = config_dtype
+        else:
+            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
+                raise ValueError(f"Unknown dtype: {dtype}")
+            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+    elif isinstance(dtype, torch.dtype):
+        torch_dtype = dtype
+    else:
+        raise ValueError(f"Unknown dtype: {dtype}")
+
+    # Verify the dtype.
+    if torch_dtype != config_dtype:
+        if torch_dtype == torch.float32:
+            # Upcasting to float32 is allowed.
+            logger.info("Upcasting %s to %s.", config_dtype, torch_dtype)
+            pass
+        elif config_dtype == torch.float32:
+            # Downcasting from float32 to float16 or bfloat16 is allowed.
+            logger.info("Downcasting %s to %s.", config_dtype, torch_dtype)
+            pass
+        else:
+            # Casting between float16 and bfloat16 is allowed with a warning.
+            logger.warning("Casting %s to %s.", config_dtype, torch_dtype)
+
+    return torch_dtype
+
+
 def is_generation_model(model_architectures: List[str], is_embedding: bool = False):
     # We have two ways to determine whether a model is a generative model.
     # 1. Check the model architectue
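
With this change, ModelConfig takes the checkpoint path as model_path and accepts dtype and quantization directly, mirroring the call added to bench_one_batch.py above. A sketch of the new constructor surface; the model id is a placeholder and the call fetches the Hugging Face config, so it needs network access or a local cache:

from sglang.srt.configs.model_config import ModelConfig

config = ModelConfig(
    model_path="Qwen/Qwen2-0.5B-Instruct",  # placeholder model id
    trust_remote_code=True,
    model_override_args="{}",
    dtype="bfloat16",        # resolved by _get_and_verify_dtype
    quantization=None,       # checked against QUANTIZATION_METHODS by _verify_quantization
)
print(config.dtype)  # torch.bfloat16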
sglang/srt/configs/qwen2vl.py CHANGED
@@ -121,13 +121,10 @@ class Qwen2VLConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
         self.rope_scaling = rope_scaling
 
-        # NOTE: the following section from original transformers config
-        # for Qwen2-VL is commented out to address rope config loading issue
-        #
-        # if self.rope_scaling is not None and "type" in self.rope_scaling:
-        #     if self.rope_scaling["type"] == "mrope":
-        #         self.rope_scaling["type"] = "default"
-        #     self.rope_scaling["rope_type"] = self.rope_scaling["type"]
-        # rope_config_validation(self)
+        # NOTE(HandH1998): This is necessary for configuring the `rope_type`` of qwen2vl models after removing dependencies on vllm.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            if self.rope_scaling["type"] == "mrope":
+                self.rope_scaling["type"] = "default"
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
 
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
sglang/srt/constrained/outlines_backend.py CHANGED
@@ -42,6 +42,7 @@ class OutlinesGrammar(BaseGrammarObject):
         self.guide = guide
         self.jump_forward_map = jump_forward_map
         self.state = 0
+        self.finished = False
 
     def accept_token(self, token: int):
         self.state = self.guide.get_next_state(self.state, token)
@@ -84,6 +85,10 @@ class OutlinesGrammar(BaseGrammarObject):
     ) -> torch.Tensor:
         return torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=device)
 
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask
+
     def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
         tokens = torch.tensor(
             self.guide.get_next_instruction(self.state).tokens, dtype=torch.int64
@@ -152,7 +157,12 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
            raise ValueError(f"Invalid key_type: {key_type}")
 
        try:
-            guide = RegexGuide(regex, self.outlines_tokenizer)
+            if hasattr(RegexGuide, "from_regex"):
+                # outlines >= 0.1.1
+                guide = RegexGuide.from_regex(regex, self.outlines_tokenizer)
+            else:
+                # outlines <= 0.0.46
+                guide = RegexGuide(regex, self.outlines_tokenizer)
        except interegular.patterns.InvalidSyntax as e:
            logger.warning(f"skip invalid regex schema: {regex=}, {e=}")
            return None
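
The RegexGuide change above is a feature-detection shim: newer outlines releases build guides through a classmethod, older ones through the constructor. A generic sketch of the same pattern; build_guide and the guide_cls parameter are illustrative, the backend itself does this check inline:

def build_guide(guide_cls, regex: str, tokenizer):
    if hasattr(guide_cls, "from_regex"):
        # outlines >= 0.1.1 exposes a classmethod constructor
        return guide_cls.from_regex(regex, tokenizer)
    # outlines <= 0.0.46 takes the regex directly in __init__
    return guide_cls(regex, tokenizer)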
sglang/srt/constrained/outlines_jump_forward.py CHANGED
@@ -23,7 +23,14 @@ from collections import defaultdict
 import interegular
 from interegular import InvalidSyntax
 from outlines.caching import cache as disk_cache
-from outlines.fsm.regex import FSMInfo, make_byte_level_fsm, make_deterministic_fsm
+
+try:
+    # outlines >= 0.1.0
+    from outlines_core.fsm.outlines_core_rs import FSMInfo
+    from outlines_core.fsm.regex import make_byte_level_fsm, make_deterministic_fsm
+except ImportError:
+    # outlines <= 0.0.46
+    from outlines.fsm.regex import FSMInfo, make_byte_level_fsm, make_deterministic_fsm
 
 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
 
sglang/srt/constrained/xgrammar_backend.py CHANGED
@@ -45,6 +45,7 @@ class XGrammarGrammar(BaseGrammarObject):
         self.matcher = matcher
         self.vocab_size = vocab_size
         self.ctx = ctx
+        self.finished = False
 
     def accept_token(self, token: int):
         assert self.matcher.accept_token(token)
@@ -85,12 +86,11 @@ class XGrammarGrammar(BaseGrammarObject):
         self.matcher.fill_next_token_bitmask(vocab_mask, idx)
 
     @staticmethod
-    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        if vocab_mask.device.type != logits.device.type:
-            # vocab_mask must then be on the same device as logits
-            # when applying the token bitmask, so we check and move if needed
-            vocab_mask = vocab_mask.to(logits.device)
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask.to(device, non_blocking=True)
 
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
         apply_token_bitmask_inplace(logits, vocab_mask)
 
     def copy(self):
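
The mask-handling API is now split into three steps: fill the per-request mask, move it to the logits device with move_vocab_mask (a no-op for the outlines backend, a non-blocking copy for xgrammar), and apply it in place with apply_vocab_mask. A sketch of the expected call order; the apply_grammar_mask helper is illustrative, and the bool-mask allocation shown matches the outlines backend (xgrammar allocates its own bitmask layout):

import torch

def apply_grammar_mask(grammar, logits: torch.Tensor, idx: int) -> None:
    batch_size, vocab_size = logits.shape
    vocab_mask = torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=logits.device)
    grammar.fill_vocab_mask(vocab_mask, idx)                         # fill the mask for request idx
    vocab_mask = grammar.move_vocab_mask(vocab_mask, logits.device)  # no-op or non-blocking copy
    grammar.apply_vocab_mask(logits, vocab_mask)                     # in-place masking of the logits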
sglang/srt/distributed/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .communication_op import *
+from .parallel_state import *
+from .utils import *
sglang/srt/distributed/communication_op.py ADDED
@@ -0,0 +1,34 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/communication_op.py
+from typing import Any, Dict, Optional, Union
+
+import torch
+import torch.distributed
+
+from .parallel_state import get_tp_group
+
+
+def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
+    """All-reduce the input tensor across model parallel group."""
+    return get_tp_group().all_reduce(input_)
+
+
+def tensor_model_parallel_all_gather(
+    input_: torch.Tensor, dim: int = -1
+) -> torch.Tensor:
+    """All-gather the input tensor across model parallel group."""
+    return get_tp_group().all_gather(input_, dim)
+
+
+def tensor_model_parallel_gather(
+    input_: torch.Tensor, dst: int = 0, dim: int = -1
+) -> Optional[torch.Tensor]:
+    """Gather the input tensor across model parallel group."""
+    return get_tp_group().gather(input_, dst, dim)
+
+
+def broadcast_tensor_dict(
+    tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
+):
+    if not torch.distributed.is_initialized():
+        return tensor_dict
+    return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
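
These helpers route collectives through the tensor-parallel group created in parallel_state; broadcast_tensor_dict degrades to a no-op when torch.distributed is not initialized. A usage sketch, assuming the distributed state has already been set up by the model runner at startup:

import torch
from sglang.srt.distributed import (
    broadcast_tensor_dict,
    tensor_model_parallel_all_reduce,
)

def row_parallel_forward(partial_out: torch.Tensor) -> torch.Tensor:
    # Each TP rank holds a partial matmul result; sum the partials across the TP group.
    return tensor_model_parallel_all_reduce(partial_out)

# Without torch.distributed initialized, this simply returns its input dict.
print(broadcast_tensor_dict({"step": torch.tensor(0)}))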