PyPI - sglang - Versions diffs - 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

sglang 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

sglang/srt/managers/controller/model_runner.py +6 -48
sglang/srt/server.py +1 -4
sglang/srt/utils.py +50 -1
sglang/version.py +1 -1
{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/METADATA +2 -6
{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/RECORD +9 -9
{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/LICENSE +0 -0
{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/WHEEL +0 -0
{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/top_level.txt +0 -0

sglang/srt/managers/controller/model_runner.py CHANGED Viewed

@@ -15,7 +15,6 @@ from flashinfer import (
     BatchPrefillWithRaggedKVCacheWrapper,
 )
 from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
-from torch.nn.parameter import Parameter
 from vllm.config import DeviceConfig, LoadConfig
 from vllm.config import ModelConfig as VllmModelConfig
 from vllm.distributed import (
@@ -23,7 +22,6 @@ from vllm.distributed import (
     init_distributed_environment,
     initialize_model_parallel,
 )
-from vllm.model_executor.layers.linear import QKVParallelLinear
 from vllm.model_executor.models import ModelRegistry
 from sglang.global_config import global_config
@@ -32,26 +30,16 @@ from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     get_available_gpu_memory,
+    is_llama3_405b_fp8,
     is_multimodal_model,
     monkey_patch_vllm_dummy_weight_loader,
     monkey_patch_vllm_p2p_access_check,
+    monkey_patch_vllm_qvk_linear_loader,
 )
 logger = logging.getLogger("srt.model_runner")
-def is_llama3_405b_fp8(model_config):
-    if (
-        model_config.hf_config.architectures[0] == "LlamaForCausalLM"
-        and model_config.hf_config.hidden_size == 16384
-        and model_config.hf_config.intermediate_size == 53248
-        and model_config.hf_config.num_hidden_layers == 126
-        and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
-    ):
-        return True
-    return False
 class ModelRunner:
     def __init__(
         self,
@@ -132,9 +120,13 @@ class ModelRunner:
             seed=42,
             skip_tokenizer_init=True,
         )
         if is_llama3_405b_fp8(self.model_config):
+            # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints
             self.model_config.hf_config.num_key_value_heads = 8
             vllm_model_config.hf_config.num_key_value_heads = 8
+            monkey_patch_vllm_qvk_linear_loader()
         self.dtype = vllm_model_config.dtype
         if self.model_config.model_overide_args is not None:
             vllm_model_config.hf_config.update(self.model_config.model_overide_args)
@@ -387,39 +379,5 @@ def load_model_cls_srt(model_arch: str) -> Optional[Type[nn.Module]]:
     return model_arch_name_to_cls[model_arch]
-def get_original_weight(loaded_weight, head_dim):
-    n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
-    dim = loaded_weight.shape[1]
-    for i in range(n_kv_head):
-        loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
-            2 * i * head_dim : (2 * i + 1) * head_dim, :
-        ]
-    original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
-    assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
-    return original_kv_weight
-def get_weight_loader_srt(weight_loader):
-    def weight_loader_srt(
-        self,
-        param: Parameter,
-        loaded_weight: torch.Tensor,
-        loaded_shard_id: Optional[str] = None,
-    ):
-        if (
-            loaded_shard_id in ["k", "v"]
-            and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
-        ):
-            loaded_weight = get_original_weight(loaded_weight, self.head_size)
-        weight_loader(self, param, loaded_weight, loaded_shard_id)
-    return weight_loader_srt
 # Monkey patch model loader
 setattr(ModelRegistry, "load_model_cls", load_model_cls_srt)
-original_weight_loader = QKVParallelLinear.weight_loader
-setattr(
-    QKVParallelLinear, "weight_loader", get_weight_loader_srt(original_weight_loader)
-)

sglang/srt/server.py CHANGED Viewed

@@ -202,15 +202,12 @@ def launch_server(
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
-    if server_args.tp_size // server_args.dp_size > 1:
+    if server_args.tp_size * server_args.dp_size > 1:
         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
         maybe_set_triton_cache_manager()
     if server_args.chat_template:
         # TODO: replace this with huggingface transformers template
         load_chat_template_for_openai_api(server_args.chat_template)
     if server_args.enable_torch_compile:
         _set_torch_compile_config()

sglang/srt/utils.py CHANGED Viewed

@@ -21,6 +21,7 @@ import torch.distributed as dist
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
 from starlette.middleware.base import BaseHTTPMiddleware
+from torch.nn.parameter import Parameter
 from triton.runtime.cache import (
     FileCacheManager,
     default_cache_dir,
@@ -471,7 +472,7 @@ def maybe_set_triton_cache_manager() -> None:
     cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
     if cache_manger is None:
         manager = "sglang.srt.utils:CustomCacheManager"
-        logger.info("Setting Triton cache manager to: %s", manager)
+        logger.debug("Setting Triton cache manager to: %s", manager)
         os.environ["TRITON_CACHE_MANAGER"] = manager
@@ -615,3 +616,51 @@ def set_ulimit(target_soft_limit=65535):
             resource.setrlimit(resource_type, (target_soft_limit, current_hard))
         except ValueError as e:
             logger.warn(f"Fail to set RLIMIT_NOFILE: {e}")
+def is_llama3_405b_fp8(model_config):
+    """Return whether the model is meta-llama/Meta-Llama-3.1-405B-FP8 with 16 kv heads."""
+    if (
+        model_config.hf_config.architectures[0] == "LlamaForCausalLM"
+        and model_config.hf_config.hidden_size == 16384
+        and model_config.hf_config.intermediate_size == 53248
+        and model_config.hf_config.num_hidden_layers == 126
+        and model_config.hf_config.num_key_value_heads == 16
+        and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
+    ):
+        return True
+    return False
+def monkey_patch_vllm_qvk_linear_loader():
+    """A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints."""
+    from vllm.model_executor.layers.linear import QKVParallelLinear
+    origin_weight_loader = QKVParallelLinear.weight_loader
+    def get_original_weight(loaded_weight, head_dim):
+        n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
+        dim = loaded_weight.shape[1]
+        for i in range(n_kv_head):
+            loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
+                2 * i * head_dim : (2 * i + 1) * head_dim, :
+            ]
+        original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
+        assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
+        return original_kv_weight
+    def weight_loader_srt(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[str] = None,
+    ):
+        if (
+            loaded_shard_id in ["k", "v"]
+            and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
+        ):
+            loaded_weight = get_original_weight(loaded_weight, self.head_size)
+        origin_weight_loader(self, param, loaded_weight, loaded_shard_id)
+    setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)

sglang/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "0.1.26"
+__version__ = "0.2.0"

{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.26
+Version: 0.2.0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                    Version 2.0, January 2004
@@ -315,11 +315,6 @@ docker run --gpus all \
 ```
 ### Common Notes
-- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
-```
-pip uninstall -y triton triton-nightly
-pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
-```
 - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
@@ -402,6 +397,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 ### Supported Models

{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/RECORD RENAMED Viewed

@@ -7,7 +7,7 @@ sglang/global_config.py,sha256=QG-ABVJksKK_llvUx7fSZcmK4GGCs-hBUVcM4LCr7Nw,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
-sglang/version.py,sha256=3_QdGLpuk_SDY7k9PpNcHpSTjlPdhadPiEgF82wzkqk,23
+sglang/version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
@@ -28,9 +28,9 @@ sglang/srt/memory_pool.py,sha256=FhJk5GtYortO3MJIsMMQ-o49agwDHVX1aEQH2LITq6c,394
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
 sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
 sglang/srt/sampling_params.py,sha256=OI11asr1Bd_E5soDjih614v4flgWxdMZU9HAF0aBafQ,3062
-sglang/srt/server.py,sha256=DXhcJt0V24a7yhydP1abPrK1qqV3qt7r8cyOMVOAI4M,14611
+sglang/srt/server.py,sha256=IUed6vnXCx7-xbrpEMAaJZ_aa4UubPAQ5pXvcv-xNoY,14607
 sglang/srt/server_args.py,sha256=aF6L35mEB-FU3BL_ooKuCIcOXLhYLxA9-MjpaOTQRCo,13189
-sglang/srt/utils.py,sha256=bUp3SLzbDms0dvuETaccDPAGRHOIGW5A61pqH62XiT0,20370
+sglang/srt/utils.py,sha256=DZtYSTvtSf_HWZjKZyo8TFiXahz-JfeujJcKBuBkhpQ,22318
 sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
 sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
@@ -51,7 +51,7 @@ sglang/srt/managers/controller/cuda_graph_runner.py,sha256=0aRqA1_34oJ557Zn8PjpJ
 sglang/srt/managers/controller/infer_batch.py,sha256=SKwCwhnZ_CNlG0mVCEc4X0e4HNjJFke-c8zdWP3TzjQ,34186
 sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
 sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
-sglang/srt/managers/controller/model_runner.py,sha256=FwZ7FU7nhJsYhtoTNxYFc4e6oMEwSqOh8ohXOKtFPKc,15828
+sglang/srt/managers/controller/model_runner.py,sha256=WzbyGkMnULuDkZ_SUe-UfOH2OZEQ-IE8aYYdQacy7fM,14349
 sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
 sglang/srt/managers/controller/schedule_heuristic.py,sha256=SQAGzPS3aB_TPj7rnPBhewwyR6W1sVwW4D3zG3JUY00,2714
 sglang/srt/managers/controller/tp_worker.py,sha256=yjz-Xzl0zEy4QSU-EYneZH5vi3oHtBuXTtYe4VuDp2g,30517
@@ -85,8 +85,8 @@ sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq65
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
 sglang/test/test_programs.py,sha256=uefeHUFKT2NJESOujj-CsnPXdw1aQQN2TzUbPCHJjGs,13654
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.1.26.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.1.26.dist-info/METADATA,sha256=QnzTK6blFTHKTDw9ULRpaJVvXyg0MuzkdqwYkk0zPb0,30986
-sglang-0.1.26.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
-sglang-0.1.26.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.1.26.dist-info/RECORD,,
+sglang-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.0.dist-info/METADATA,sha256=mk2lWkWZKtTJFXM7e_z2dMdke8WiV67X9aL48lGLRaw,30791
+sglang-0.2.0.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+sglang-0.2.0.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.0.dist-info/RECORD,,

{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{sglang-0.1.26.dist-info → sglang-0.2.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

sglang 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl

sglang 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl