sglang 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (56)
  1. sglang/__init__.py +55 -2
  2. sglang/api.py +3 -5
  3. sglang/backend/anthropic.py +18 -4
  4. sglang/backend/openai.py +2 -1
  5. sglang/backend/runtime_endpoint.py +18 -5
  6. sglang/backend/vertexai.py +1 -0
  7. sglang/global_config.py +1 -0
  8. sglang/lang/chat_template.py +74 -0
  9. sglang/lang/interpreter.py +40 -16
  10. sglang/lang/tracer.py +6 -4
  11. sglang/launch_server.py +2 -1
  12. sglang/srt/constrained/fsm_cache.py +1 -0
  13. sglang/srt/constrained/jump_forward.py +1 -0
  14. sglang/srt/conversation.py +2 -2
  15. sglang/srt/hf_transformers_utils.py +2 -1
  16. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  17. sglang/srt/layers/extend_attention.py +1 -0
  18. sglang/srt/layers/logits_processor.py +114 -54
  19. sglang/srt/layers/radix_attention.py +2 -1
  20. sglang/srt/layers/token_attention.py +1 -0
  21. sglang/srt/managers/detokenizer_manager.py +5 -1
  22. sglang/srt/managers/io_struct.py +12 -0
  23. sglang/srt/managers/router/infer_batch.py +70 -33
  24. sglang/srt/managers/router/manager.py +7 -2
  25. sglang/srt/managers/router/model_rpc.py +116 -73
  26. sglang/srt/managers/router/model_runner.py +111 -167
  27. sglang/srt/managers/router/radix_cache.py +46 -38
  28. sglang/srt/managers/tokenizer_manager.py +56 -11
  29. sglang/srt/memory_pool.py +5 -14
  30. sglang/srt/model_config.py +7 -0
  31. sglang/srt/models/commandr.py +376 -0
  32. sglang/srt/models/dbrx.py +413 -0
  33. sglang/srt/models/dbrx_config.py +281 -0
  34. sglang/srt/models/gemma.py +22 -20
  35. sglang/srt/models/llama2.py +23 -21
  36. sglang/srt/models/llava.py +12 -10
  37. sglang/srt/models/mixtral.py +27 -25
  38. sglang/srt/models/qwen.py +23 -21
  39. sglang/srt/models/qwen2.py +23 -21
  40. sglang/srt/models/stablelm.py +20 -21
  41. sglang/srt/models/yivl.py +6 -5
  42. sglang/srt/openai_api_adapter.py +356 -0
  43. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
  44. sglang/srt/sampling_params.py +2 -0
  45. sglang/srt/server.py +68 -447
  46. sglang/srt/server_args.py +76 -49
  47. sglang/srt/utils.py +88 -32
  48. sglang/srt/weight_utils.py +402 -0
  49. sglang/test/test_programs.py +8 -7
  50. sglang/test/test_utils.py +195 -7
  51. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/METADATA +12 -14
  52. sglang-0.1.15.dist-info/RECORD +69 -0
  53. sglang-0.1.14.dist-info/RECORD +0 -64
  54. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/LICENSE +0 -0
  55. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/WHEEL +0 -0
  56. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/top_level.txt +0 -0
sglang/srt/weight_utils.py (added)
@@ -0,0 +1,402 @@
+ # The vllm PR https://github.com/vllm-project/vllm/pull/4097 broke the sglang code.
+ # In order to adapt to the latest code without modifying too much code,
+ # we copied the previous vllm/model_executor/weight_utils.py from
+ # https://github.com/vllm-project/vllm/blob/05434764cd99990035779cf9a4ed86623b528825/vllm/model_executor/weight_utils.py
+
+ """Utilities for downloading and initializing model weights."""
+ import fnmatch
+ import glob
+ import hashlib
+ import json
+ import os
+ from collections import defaultdict
+ from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
+
+ import filelock
+ import huggingface_hub.constants
+ import numpy as np
+ import torch
+ from huggingface_hub import HfFileSystem, snapshot_download
+ from safetensors.torch import load_file, safe_open, save_file
+ from tqdm.auto import tqdm
+
+ from vllm.config import ModelConfig
+ from vllm.logger import init_logger
+ from vllm.model_executor.layers.quantization import (QuantizationConfig,
+                                                      get_quantization_config)
+ from vllm.model_executor.layers.quantization.schema import QuantParamSchema
+
+ logger = init_logger(__name__)
+
+ # use system-level temp directory for file locks, so that multiple users
+ # can share the same lock without error.
+ # lock files in the temp directory will be automatically deleted when the
+ # system reboots, so users will not complain about annoying lock files
+ temp_dir = os.environ.get('TMPDIR') or os.environ.get(
+     'TEMP') or os.environ.get('TMP') or "/tmp/"
+
+
+ def enable_hf_transfer():
+     """Automatically activate hf_transfer.
+     """
+     if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
+         try:
+             # enable hf hub transfer if available
+             import hf_transfer  # type: ignore # noqa
+             huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
+         except ImportError:
+             pass
+
+
+ enable_hf_transfer()
+
+
+ class Disabledtqdm(tqdm):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs, disable=True)
+
+
+ def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
+     lock_dir = cache_dir or temp_dir
+     os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
+     model_name = model_name_or_path.replace("/", "-")
+     hash_name = hashlib.sha256(model_name.encode()).hexdigest()
+     # add hash to avoid conflict with old users' lock files
+     lock_file_name = hash_name + model_name + ".lock"
+     # mode 0o666 is required for the filelock to be shared across users
+     lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name),
+                              mode=0o666)
+     return lock
+
+
+ def _shared_pointers(tensors):
+     ptrs = defaultdict(list)
+     for k, v in tensors.items():
+         ptrs[v.data_ptr()].append(k)
+     failing = []
+     for _, names in ptrs.items():
+         if len(names) > 1:
+             failing.append(names)
+     return failing
+
+
+ def convert_bin_to_safetensor_file(
+     pt_filename: str,
+     sf_filename: str,
+ ) -> None:
+     loaded = torch.load(pt_filename, map_location="cpu")
+     if "state_dict" in loaded:
+         loaded = loaded["state_dict"]
+     shared = _shared_pointers(loaded)
+     for shared_weights in shared:
+         for name in shared_weights[1:]:
+             loaded.pop(name)
+
+     # For tensors to be contiguous
+     loaded = {k: v.contiguous() for k, v in loaded.items()}
+
+     dirname = os.path.dirname(sf_filename)
+     os.makedirs(dirname, exist_ok=True)
+     save_file(loaded, sf_filename, metadata={"format": "pt"})
+
+     # check file size
+     sf_size = os.stat(sf_filename).st_size
+     pt_size = os.stat(pt_filename).st_size
+     if (sf_size - pt_size) / pt_size > 0.01:
+         raise RuntimeError(f"""The file size difference is more than 1%:
+          - {sf_filename}: {sf_size}
+          - {pt_filename}: {pt_size}
+          """)
+
+     # check if the tensors are the same
+     reloaded = load_file(sf_filename)
+     for k in loaded:
+         pt_tensor = loaded[k]
+         sf_tensor = reloaded[k]
+         if not torch.equal(pt_tensor, sf_tensor):
+             raise RuntimeError(f"The output tensors do not match for key {k}")
+
+
+ # TODO(woosuk): Move this to other place.
+ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
+     quant_cls = get_quantization_config(model_config.quantization)
+     # Read the quantization config from the HF model config, if available.
+     hf_quant_config = getattr(model_config.hf_config, "quantization_config",
+                               None)
+     if hf_quant_config is not None:
+         return quant_cls.from_config(hf_quant_config)
+     model_name_or_path = model_config.model
+     is_local = os.path.isdir(model_name_or_path)
+     if not is_local:
+         # Download the config files.
+         with get_lock(model_name_or_path, model_config.download_dir):
+             hf_folder = snapshot_download(model_name_or_path,
+                                           revision=model_config.revision,
+                                           allow_patterns="*.json",
+                                           cache_dir=model_config.download_dir,
+                                           tqdm_class=Disabledtqdm)
+     else:
+         hf_folder = model_name_or_path
+     config_files = glob.glob(os.path.join(hf_folder, "*.json"))
+
+     quant_config_files = [
+         f for f in config_files if any(
+             f.endswith(x) for x in quant_cls.get_config_filenames())
+     ]
+     if len(quant_config_files) == 0:
+         raise ValueError(
+             f"Cannot find the config file for {model_config.quantization}")
+     if len(quant_config_files) > 1:
+         raise ValueError(
+             f"Found multiple config files for {model_config.quantization}: "
+             f"{quant_config_files}")
+
+     quant_config_file = quant_config_files[0]
+     with open(quant_config_file, "r") as f:
+         config = json.load(f)
+     return quant_cls.from_config(config)
+
+
+ def prepare_hf_model_weights(
+     model_name_or_path: str,
+     cache_dir: Optional[str] = None,
+     load_format: str = "auto",
+     fall_back_to_pt: bool = True,
+     revision: Optional[str] = None,
+ ) -> Tuple[str, List[str], bool]:
+     # Download model weights from huggingface.
+     is_local = os.path.isdir(model_name_or_path) \
+         and load_format != "tensorizer"
+     use_safetensors = False
+     # Some quantized models use .pt files for storing the weights.
+     if load_format == "auto":
+         allow_patterns = ["*.safetensors", "*.bin"]
+     elif load_format == "safetensors":
+         use_safetensors = True
+         allow_patterns = ["*.safetensors"]
+     elif load_format == "pt":
+         allow_patterns = ["*.pt"]
+     elif load_format == "npcache":
+         allow_patterns = ["*.bin"]
+     elif load_format == "tensorizer":
+         allow_patterns = ["*.tensors"]
+     else:
+         raise ValueError(f"Unknown load_format: {load_format}")
+
+     if fall_back_to_pt:
+         allow_patterns += ["*.pt"]
+
+     if not is_local and load_format != "tensorizer":
+         # Before we download, we look at what is available:
+         fs = HfFileSystem()
+         file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
+
+         # depending on what is available we download different things
+         for pattern in allow_patterns:
+             matching = fnmatch.filter(file_list, pattern)
+             if len(matching) > 0:
+                 allow_patterns = [pattern]
+                 break
+
+         logger.info(f"Using model weights format {allow_patterns}")
+         # Use file lock to prevent multiple processes from
+         # downloading the same model weights at the same time.
+         with get_lock(model_name_or_path, cache_dir):
+             hf_folder = snapshot_download(model_name_or_path,
+                                           allow_patterns=allow_patterns,
+                                           cache_dir=cache_dir,
+                                           tqdm_class=Disabledtqdm,
+                                           revision=revision)
+     else:
+         hf_folder = model_name_or_path
+     hf_weights_files: List[str] = []
+     for pattern in allow_patterns:
+         hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+         if len(hf_weights_files) > 0:
+             if pattern == "*.safetensors":
+                 use_safetensors = True
+             break
+     if not use_safetensors:
+         # Exclude files that are not needed for inference.
+         # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
+         blacklist = [
+             "training_args.bin",
+             "optimizer.bin",
+             "optimizer.pt",
+             "scheduler.pt",
+             "scaler.pt",
+         ]
+         hf_weights_files = [
+             f for f in hf_weights_files
+             if not any(f.endswith(x) for x in blacklist)
+         ]
+
+     if load_format == "tensorizer":
+         return hf_folder, hf_weights_files, use_safetensors
+
+     if len(hf_weights_files) == 0:
+         raise RuntimeError(
+             f"Cannot find any model weights with `{model_name_or_path}`")
+
+     return hf_folder, hf_weights_files, use_safetensors
+
+
+ def hf_model_weights_iterator(
+     model_name_or_path: str,
+     cache_dir: Optional[str] = None,
+     load_format: Union[Tuple, str] = "auto",
+     revision: Optional[str] = None,
+     fall_back_to_pt: Optional[bool] = True,
+ ) -> Iterator[Tuple[str, torch.Tensor]]:
+     hf_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights(
+         model_name_or_path,
+         cache_dir=cache_dir,
+         load_format=load_format,
+         fall_back_to_pt=fall_back_to_pt,
+         revision=revision)
+
+     if load_format == "npcache":
+         # Currently np_cache only support *.bin checkpoints
+         assert use_safetensors is False
+
+         # Convert the model weights from torch tensors to numpy arrays for
+         # faster loading.
+         np_folder = os.path.join(hf_folder, "np")
+         os.makedirs(np_folder, exist_ok=True)
+         weight_names_file = os.path.join(np_folder, "weight_names.json")
+         # Use file lock to prevent multiple processes from
+         # dumping the same model weights to numpy at the same time.
+         with get_lock(model_name_or_path, cache_dir):
+             if not os.path.exists(weight_names_file):
+                 weight_names = []
+                 for bin_file in hf_weights_files:
+                     state = torch.load(bin_file, map_location="cpu")
+                     for name, param in state.items():
+                         param_path = os.path.join(np_folder, name)
+                         with open(param_path, "wb") as f:
+                             np.save(f, param.cpu().detach().numpy())
+                         weight_names.append(name)
+                 with open(weight_names_file, "w") as f:
+                     json.dump(weight_names, f)
+
+         with open(weight_names_file, "r") as f:
+             weight_names = json.load(f)
+
+         for name in weight_names:
+             param_path = os.path.join(np_folder, name)
+             with open(param_path, "rb") as f:
+                 param = np.load(f)
+             yield name, torch.from_numpy(param)
+     elif load_format == "tensorizer":
+         from vllm.model_executor.tensorizer_loader import (TensorDeserializer,
+                                                            open_stream,
+                                                            tensorizer_warning)
+         tensorizer_args = load_format.params
+         tensorizer_warning(
+             "Deserializing HuggingFace models is not optimized for "
+             "loading on vLLM, as tensorizer is forced to load to CPU. "
+             "Consider deserializing a vLLM model instead for faster "
+             "load times. See the examples/tensorize_vllm_model.py example "
+             "script for serializing vLLM models.")
+
+         deserializer_args = tensorizer_args.deserializer_params
+         stream_params = tensorizer_args.stream_params
+         stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
+         with TensorDeserializer(stream, **deserializer_args,
+                                 device="cpu") as state:
+             for name, param in state.items():
+                 yield name, param
+             del state
+     elif use_safetensors:
+         for st_file in hf_weights_files:
+             with safe_open(st_file, framework="pt") as f:
+                 for name in f.keys():  # noqa: SIM118
+                     param = f.get_tensor(name)
+                     yield name, param
+     else:
+         for bin_file in hf_weights_files:
+             state = torch.load(bin_file, map_location="cpu")
+             for name, param in state.items():
+                 yield name, param
+             del state
+             torch.cuda.empty_cache()
+
+
+ def kv_cache_scales_loader(
+         filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int,
+         model_type: Optional[str]) -> Iterable[Tuple[int, float]]:
+     """
+     A simple utility to read in KV cache scaling factors that have been
+     previously serialized to disk. Used by the model to populate the appropriate
+     KV cache scaling factors. The serialization should represent a dictionary
+     whose keys are the TP ranks and values are another dictionary mapping layers
+     to their KV cache scaling factors.
+     Keep this function in sync with the output of examples/fp8/extract_scales.py
+     """
+     try:
+         with open(filename) as f:
+             context = {
+                 "model_type": model_type,
+                 "num_hidden_layers": num_hidden_layers,
+                 "tp_rank": tp_rank,
+                 "tp_size": tp_size,
+             }
+             schema_dct = json.load(f)
+             schema = QuantParamSchema.model_validate(schema_dct,
+                                                      context=context)
+             layer_scales_map = schema.kv_cache.scaling_factor[tp_rank]
+             return layer_scales_map.items()
+
+     except FileNotFoundError:
+         logger.error(f"File or directory '{filename}' not found.")
+     except json.JSONDecodeError:
+         logger.error(f"Error decoding JSON in file '{filename}'.")
+     except Exception as e:
+         logger.error(f"An error occurred while reading '{filename}': {e}")
+     # This section is reached if and only if any of the excepts are hit
+     # Return an empty iterable (list) => no KV cache scales are loaded
+     # which ultimately defaults to 1.0 scales
+     logger.warning("Defaulting to KV cache scaling factors = 1.0 "
+                    f"for all layers in TP rank {tp_rank} "
+                    "as an error occurred during loading.")
+     return []
+
+
+ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
+     """Convert a PySafeSlice object from safetensors to torch.Tensor.
+
+     A PySafeSlice object supports indexing, which is done before loading the
+     actual tensor and can reduce the amount of memory being read. However,
+     it does not support more advanced functionalities like `.view()` or
+     `.t()`. Therefore, if we need to modify the loaded tensor with these
+     more complicated operators, we need to convert it to a tensor first.
+     """
+     if not isinstance(x, torch.Tensor):
+         x = x[:]
+     return x
+
+
+ def default_weight_loader(param: torch.Tensor,
+                           loaded_weight: torch.Tensor) -> None:
+     """Default weight loader."""
+     assert param.size() == loaded_weight.size()
+     param.data.copy_(loaded_weight)
+
+
+ def initialize_dummy_weights(
+     model: torch.nn.Module,
+     low: float = -1e-3,
+     high: float = 1e-3,
+ ) -> None:
+     """Initialize model weights with random values.
+
+     The model weights must be randomly initialized for accurate performance
+     measurements. Additionally, the model weights should not cause NaNs in the
+     forward pass. We empirically found that initializing the weights with
+     values between -1e-3 and 1e-3 works well for most models.
+     """
+     for param in model.state_dict().values():
+         if torch.is_floating_point(param):
+             param.data.uniform_(low, high)
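
The new file above is a vendored copy of vllm's weight-loading helpers, so sglang model code can keep calling hf_model_weights_iterator and default_weight_loader even as upstream vllm changes. As a minimal sketch of that calling pattern (not code from this release): the TinyModel module and the "org/model" checkpoint id below are illustrative placeholders, and only tensors whose names and shapes match the module are copied.

# Sketch: how a model's load_weights() path typically combines the vendored
# helpers. TinyModel and "org/model" are placeholders for illustration.
import torch.nn as nn

from sglang.srt.weight_utils import default_weight_loader, hf_model_weights_iterator


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(16, 16)


def load_weights(model: nn.Module, model_name_or_path: str) -> None:
    params = dict(model.named_parameters())
    # Resolves a local checkpoint directory or downloads from the HF Hub under
    # a file lock, then yields (name, tensor) pairs from the weight shards.
    for name, loaded_weight in hf_model_weights_iterator(model_name_or_path):
        param = params.get(name)
        if param is not None and param.shape == loaded_weight.shape:
            default_weight_loader(param, loaded_weight)


load_weights(TinyModel(), "org/model")  # replace with a real checkpoint id or path
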
sglang/test/test_programs.py
@@ -226,7 +226,7 @@ Action 3: Finish [United States].\n
 
  def test_parallel_decoding():
      max_tokens = 64
-     number = 5
+     fork_size = 5
 
      @sgl.function
      def parallel_decoding(s, topic):
@@ -234,17 +234,17 @@ def test_parallel_decoding():
          s += "USER: Give some tips for " + topic + ".\n"
          s += (
              "ASSISTANT: Okay. Here are "
-             + str(number)
+             + str(fork_size)
              + " concise tips, each under 8 words:\n"
          )
 
          # Generate skeleton
-         for i in range(1, 1 + number):
+         for i in range(1, 1 + fork_size):
              s += f"{i}." + sgl.gen(max_tokens=16, stop=[".", "\n"]) + ".\n"
 
          # Generate detailed tips
-         forks = s.fork(number)
-         for i in range(number):
+         forks = s.fork(fork_size)
+         for i in range(fork_size):
              forks[
                  i
              ] += f"Now, I expand tip {i+1} into a detailed paragraph:\nTip {i+1}:"
@@ -253,7 +253,7 @@ def test_parallel_decoding():
 
          # Concatenate tips and summarize
          s += "Here are these tips with detailed explanation:\n"
-         for i in range(number):
+         for i in range(fork_size):
              s += f"Tip {i+1}:" + forks[i]["detailed_tip"] + "\n"
 
          s += "\nIn summary," + sgl.gen("summary", max_tokens=512)
@@ -296,7 +296,7 @@ def test_parallel_encoding(check_answer=True):
  def test_image_qa():
      @sgl.function
      def image_qa(s, question):
-         s += sgl.user(sgl.image("test_image.png") + question)
+         s += sgl.user(sgl.image("example_image.png") + question)
          s += sgl.assistant(sgl.gen("answer"))
 
      state = image_qa.run(
@@ -313,6 +313,7 @@ def test_image_qa():
  def test_stream():
      @sgl.function
      def qa(s, question):
+         s += sgl.system("You are a helpful assistant.")
          s += sgl.user(question)
          s += sgl.assistant(sgl.gen("answer"))
 
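
The test changes above rename the fork count to fork_size and add a system message to the streaming test; both tests exercise sglang's skeleton-then-expand fork pattern against a running backend. A condensed, self-contained sketch of that pattern follows; the endpoint URL, topic, and token budgets are placeholders rather than values from the test suite.

# Sketch of the fork/join pattern exercised by test_parallel_decoding.
# The endpoint URL and prompt wording are placeholders.
import sglang as sgl


@sgl.function
def parallel_tips(s, topic, fork_size=3):
    s += "USER: Give some tips for " + topic + ".\n"
    s += f"ASSISTANT: Okay. Here are {fork_size} concise tips:\n"

    # Generate a short skeleton, one line per tip.
    for i in range(1, 1 + fork_size):
        s += f"{i}." + sgl.gen(max_tokens=16, stop=[".", "\n"]) + ".\n"

    # Fork the state and expand each tip independently.
    forks = s.fork(fork_size)
    for i in range(fork_size):
        forks[i] += f"Tip {i + 1} in detail:" + sgl.gen("detailed_tip", max_tokens=64)

    # Join the fork results back into the main stream and summarize.
    for i in range(fork_size):
        s += f"Tip {i + 1}: " + forks[i]["detailed_tip"] + "\n"
    s += "In summary," + sgl.gen("summary", max_tokens=64)


# Assumes an sglang runtime is already serving at this placeholder address.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = parallel_tips.run(topic="better sleep")
print(state["summary"])
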