sglang 0.1.24__tar.gz → 0.1.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. {sglang-0.1.24/sglang.egg-info → sglang-0.1.26}/PKG-INFO +4 -4
  2. {sglang-0.1.24 → sglang-0.1.26}/README.md +3 -3
  3. {sglang-0.1.24 → sglang-0.1.26}/pyproject.toml +1 -1
  4. {sglang-0.1.24 → sglang-0.1.26}/sglang/__init__.py +2 -2
  5. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/model_runner.py +51 -0
  6. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/server.py +6 -0
  7. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/utils.py +44 -1
  8. sglang-0.1.26/sglang/version.py +1 -0
  9. {sglang-0.1.24 → sglang-0.1.26/sglang.egg-info}/PKG-INFO +4 -4
  10. {sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/SOURCES.txt +1 -14
  11. sglang-0.1.24/sglang/backend/anthropic.py +0 -77
  12. sglang-0.1.24/sglang/backend/litellm.py +0 -90
  13. sglang-0.1.24/sglang/backend/openai.py +0 -438
  14. sglang-0.1.24/sglang/backend/runtime_endpoint.py +0 -283
  15. sglang-0.1.24/sglang/backend/vertexai.py +0 -149
  16. sglang-0.1.24/sglang/bench.py +0 -627
  17. sglang-0.1.24/sglang/lang/backend/__init__.py +0 -0
  18. sglang-0.1.24/sglang/lang/backend/base_backend.py +0 -80
  19. sglang-0.1.24/sglang/srt/managers/controller/dp_worker.py +0 -113
  20. sglang-0.1.24/sglang/srt/openai_api/api_adapter.py +0 -432
  21. sglang-0.1.24/sglang/srt/openai_api/openai_api_adapter.py +0 -431
  22. sglang-0.1.24/sglang/srt/openai_api/openai_protocol.py +0 -207
  23. sglang-0.1.24/sglang/srt/openai_api_adapter.py +0 -411
  24. sglang-0.1.24/sglang/srt/openai_protocol.py +0 -207
  25. {sglang-0.1.24 → sglang-0.1.26}/LICENSE +0 -0
  26. {sglang-0.1.24 → sglang-0.1.26}/setup.cfg +0 -0
  27. {sglang-0.1.24 → sglang-0.1.26}/sglang/api.py +0 -0
  28. {sglang-0.1.24 → sglang-0.1.26}/sglang/bench_latency.py +0 -0
  29. {sglang-0.1.24 → sglang-0.1.26}/sglang/bench_serving.py +0 -0
  30. {sglang-0.1.24 → sglang-0.1.26}/sglang/check_env.py +0 -0
  31. {sglang-0.1.24 → sglang-0.1.26}/sglang/global_config.py +0 -0
  32. {sglang-0.1.24/sglang/backend → sglang-0.1.26/sglang/lang}/__init__.py +0 -0
  33. {sglang-0.1.24/sglang/lang → sglang-0.1.26/sglang/lang/backend}/__init__.py +0 -0
  34. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/anthropic.py +0 -0
  35. {sglang-0.1.24/sglang → sglang-0.1.26/sglang/lang}/backend/base_backend.py +0 -0
  36. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/litellm.py +0 -0
  37. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/openai.py +0 -0
  38. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/runtime_endpoint.py +0 -0
  39. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/vertexai.py +0 -0
  40. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/chat_template.py +0 -0
  41. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/compiler.py +0 -0
  42. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/interpreter.py +0 -0
  43. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/ir.py +0 -0
  44. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/tracer.py +0 -0
  45. {sglang-0.1.24 → sglang-0.1.26}/sglang/launch_server.py +0 -0
  46. {sglang-0.1.24 → sglang-0.1.26}/sglang/launch_server_llavavid.py +0 -0
  47. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/constrained/__init__.py +0 -0
  48. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/constrained/base_cache.py +0 -0
  49. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/constrained/fsm_cache.py +0 -0
  50. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/constrained/jump_forward.py +0 -0
  51. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/conversation.py +0 -0
  52. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/flush_cache.py +0 -0
  53. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/hf_transformers_utils.py +0 -0
  54. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  55. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/extend_attention.py +0 -0
  56. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/fused_moe.py +0 -0
  57. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/linear.py +0 -0
  58. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/logits_processor.py +0 -0
  59. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/quantization/__init__.py +0 -0
  60. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/quantization/fp8.py +0 -0
  61. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/radix_attention.py +0 -0
  62. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/token_attention.py +0 -0
  63. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/cuda_graph_runner.py +0 -0
  64. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/infer_batch.py +0 -0
  65. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/manager_multi.py +0 -0
  66. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/manager_single.py +0 -0
  67. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/radix_cache.py +0 -0
  68. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/schedule_heuristic.py +0 -0
  69. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/tp_worker.py +0 -0
  70. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/detokenizer_manager.py +0 -0
  71. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/io_struct.py +0 -0
  72. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/tokenizer_manager.py +0 -0
  73. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/memory_pool.py +0 -0
  74. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/mm_utils.py +0 -0
  75. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/model_config.py +0 -0
  76. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/model_loader/model_loader.py +0 -0
  77. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/model_loader/utils.py +0 -0
  78. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/chatglm.py +0 -0
  79. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/commandr.py +0 -0
  80. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/dbrx.py +0 -0
  81. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/deepseek.py +0 -0
  82. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/gemma.py +0 -0
  83. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/gemma2.py +0 -0
  84. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/gpt_bigcode.py +0 -0
  85. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/grok.py +0 -0
  86. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/internlm2.py +0 -0
  87. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/llama2.py +0 -0
  88. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/llama_classification.py +0 -0
  89. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/llava.py +0 -0
  90. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/llavavid.py +0 -0
  91. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/minicpm.py +0 -0
  92. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/mistral.py +0 -0
  93. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/mixtral.py +0 -0
  94. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/mixtral_quant.py +0 -0
  95. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/qwen.py +0 -0
  96. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/qwen2.py +0 -0
  97. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/qwen2_moe.py +0 -0
  98. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/stablelm.py +0 -0
  99. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/yivl.py +0 -0
  100. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/openai_api/adapter.py +0 -0
  101. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/openai_api/protocol.py +0 -0
  102. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/sampling_params.py +0 -0
  103. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/server_args.py +0 -0
  104. {sglang-0.1.24 → sglang-0.1.26}/sglang/test/test_conversation.py +0 -0
  105. {sglang-0.1.24 → sglang-0.1.26}/sglang/test/test_openai_protocol.py +0 -0
  106. {sglang-0.1.24 → sglang-0.1.26}/sglang/test/test_programs.py +0 -0
  107. {sglang-0.1.24 → sglang-0.1.26}/sglang/test/test_utils.py +0 -0
  108. {sglang-0.1.24 → sglang-0.1.26}/sglang/utils.py +0 -0
  109. {sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/dependency_links.txt +0 -0
  110. {sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/requires.txt +0 -0
  111. {sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.24/sglang.egg-info → sglang-0.1.26}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.24
+Version: 0.1.26
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -244,7 +244,7 @@ Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
 
 <div align="center">
-<img src="assets/logo.png" alt="logo" width="400"></img>
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
 </div>
 
 --------------------------------------------------------------------------------
@@ -282,7 +282,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
-pip install --upgrade pip setuptools wheel
+pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -405,7 +405,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
-- Llama / Llama 2 / Llama 3
+- Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
{sglang-0.1.24 → sglang-0.1.26}/README.md
@@ -1,5 +1,5 @@
 <div align="center">
-<img src="assets/logo.png" alt="logo" width="400"></img>
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
 </div>
 
 --------------------------------------------------------------------------------
@@ -37,7 +37,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
-pip install --upgrade pip setuptools wheel
+pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -160,7 +160,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
-- Llama / Llama 2 / Llama 3
+- Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
{sglang-0.1.24 → sglang-0.1.26}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.24"
+version = "0.1.26"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.1.24 → sglang-0.1.26}/sglang/__init__.py
@@ -1,5 +1,3 @@
-__version__ = "0.1.24"
-
 # SGL API Components
 from sglang.api import (
     Runtime,
@@ -32,6 +30,8 @@ from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.backend.vertexai import VertexAI
 
+from .version import __version__
+
 # public APIs management
 __all__ = [
     "global_config",
{sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/model_runner.py
@@ -15,6 +15,7 @@ from flashinfer import (
     BatchPrefillWithRaggedKVCacheWrapper,
 )
 from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+from torch.nn.parameter import Parameter
 from vllm.config import DeviceConfig, LoadConfig
 from vllm.config import ModelConfig as VllmModelConfig
 from vllm.distributed import (
@@ -22,6 +23,7 @@ from vllm.distributed import (
     init_distributed_environment,
     initialize_model_parallel,
 )
+from vllm.model_executor.layers.linear import QKVParallelLinear
 from vllm.model_executor.models import ModelRegistry
 
 from sglang.global_config import global_config
@@ -38,6 +40,18 @@ from sglang.srt.utils import (
 logger = logging.getLogger("srt.model_runner")
 
 
+def is_llama3_405b_fp8(model_config):
+    if (
+        model_config.hf_config.architectures[0] == "LlamaForCausalLM"
+        and model_config.hf_config.hidden_size == 16384
+        and model_config.hf_config.intermediate_size == 53248
+        and model_config.hf_config.num_hidden_layers == 126
+        and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
+    ):
+        return True
+    return False
+
+
 class ModelRunner:
     def __init__(
         self,
@@ -118,6 +132,9 @@ class ModelRunner:
             seed=42,
             skip_tokenizer_init=True,
         )
+        if is_llama3_405b_fp8(self.model_config):
+            self.model_config.hf_config.num_key_value_heads = 8
+            vllm_model_config.hf_config.num_key_value_heads = 8
         self.dtype = vllm_model_config.dtype
         if self.model_config.model_overide_args is not None:
             vllm_model_config.hf_config.update(self.model_config.model_overide_args)
@@ -370,5 +387,39 @@ def load_model_cls_srt(model_arch: str) -> Optional[Type[nn.Module]]:
     return model_arch_name_to_cls[model_arch]
 
 
+def get_original_weight(loaded_weight, head_dim):
+    n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
+    dim = loaded_weight.shape[1]
+    for i in range(n_kv_head):
+        loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
+            2 * i * head_dim : (2 * i + 1) * head_dim, :
+        ]
+    original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
+    assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
+    return original_kv_weight
+
+
+def get_weight_loader_srt(weight_loader):
+    def weight_loader_srt(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[str] = None,
+    ):
+        if (
+            loaded_shard_id in ["k", "v"]
+            and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
+        ):
+            loaded_weight = get_original_weight(loaded_weight, self.head_size)
+
+        weight_loader(self, param, loaded_weight, loaded_shard_id)
+
+    return weight_loader_srt
+
+
 # Monkey patch model loader
 setattr(ModelRegistry, "load_model_cls", load_model_cls_srt)
+original_weight_loader = QKVParallelLinear.weight_loader
+setattr(
+    QKVParallelLinear, "weight_loader", get_weight_loader_srt(original_weight_loader)
+)
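Note on the model_runner.py change above: the new `is_llama3_405b_fp8` check and the `QKVParallelLinear.weight_loader` monkey patch handle FBGEMM-FP8 checkpoints of Llama 3.1 405B, whose fused K/V shards arrive with every key-value head stored twice (hence the `head_size * total_num_kv_heads * 2` shape test and the `num_key_value_heads = 8` override). `get_original_weight` compacts the shard back to one copy per head before the stock vLLM loader runs. Below is a minimal sketch of that compaction on a toy tensor, assuming the duplicated head layout implied by the shape check; the demo scaffolding is not part of the diff.

```python
import torch

def get_original_weight(loaded_weight, head_dim):
    # Copied from the hunk above: keep one copy of each duplicated KV head.
    n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
    dim = loaded_weight.shape[1]
    for i in range(n_kv_head):
        # Move head i's first copy into its compacted slot.
        loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
            2 * i * head_dim : (2 * i + 1) * head_dim, :
        ]
    original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
    assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
    return original_kv_weight

head_dim, dim = 2, 3
h0 = torch.zeros(head_dim, dim)
h1 = torch.ones(head_dim, dim)
duplicated = torch.cat([h0, h0, h1, h1])             # each head stored twice
compacted = get_original_weight(duplicated.clone(), head_dim)
assert torch.equal(compacted, torch.cat([h0, h1]))   # one copy per head remains
```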
{sglang-0.1.24 → sglang-0.1.26}/sglang/srt/server.py
@@ -52,6 +52,7 @@ from sglang.srt.utils import (
     allocate_init_ports,
     assert_pkg_version,
     enable_show_time_cost,
+    maybe_set_triton_cache_manager,
     set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -201,6 +202,11 @@ def launch_server(
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
+
+    if server_args.tp_size // server_args.dp_size > 1:
+        # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+        maybe_set_triton_cache_manager()
+
     if server_args.chat_template:
         # TODO: replace this with huggingface transformers template
         load_chat_template_for_openai_api(server_args.chat_template)
{sglang-0.1.24 → sglang-0.1.26}/sglang/srt/utils.py
@@ -18,10 +18,15 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
-import triton
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
 from starlette.middleware.base import BaseHTTPMiddleware
+from triton.runtime.cache import (
+    FileCacheManager,
+    default_cache_dir,
+    default_dump_dir,
+    default_override_dir,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -460,6 +465,44 @@ def monkey_patch_vllm_all_gather(reverse: bool = False):
     setattr(GroupCoordinator, "all_gather", all_gather)
 
 
+def maybe_set_triton_cache_manager() -> None:
+    """Set environment variable to tell Triton to use a
+    custom cache manager"""
+    cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
+    if cache_manger is None:
+        manager = "sglang.srt.utils:CustomCacheManager"
+        logger.info("Setting Triton cache manager to: %s", manager)
+        os.environ["TRITON_CACHE_MANAGER"] = manager
+
+
+class CustomCacheManager(FileCacheManager):
+    # Adapted from: https://github.com/tdoublep/vllm/blob/3307522289fdfefe323b6c00d0db696651989a2f/vllm/triton_utils/custom_cache_manager.py
+    def __init__(self, key, override=False, dump=False):
+
+        self.key = key
+        self.lock_path = None
+        if dump:
+            self.cache_dir = default_dump_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+            self.lock_path = os.path.join(self.cache_dir, "lock")
+            os.makedirs(self.cache_dir, exist_ok=True)
+        elif override:
+            self.cache_dir = default_override_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+        else:
+            # create cache directory if it doesn't exist
+            self.cache_dir = (
+                os.getenv("TRITON_CACHE_DIR", "").strip() or default_cache_dir()
+            )
+            if self.cache_dir:
+                self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+                self.cache_dir = os.path.join(self.cache_dir, self.key)
+                self.lock_path = os.path.join(self.cache_dir, "lock")
+                os.makedirs(self.cache_dir, exist_ok=True)
+            else:
+                raise RuntimeError("Could not create or locate cache dir")
+
+
 API_KEY_HEADER_NAME = "X-API-Key"
 
 
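Note on the utils.py change above: `TRITON_CACHE_MANAGER` is the hook Triton reads (in `module:Class` form) to choose its cache-manager class, and `CustomCacheManager` differs from the stock `FileCacheManager` only in suffixing the cache directory with the process ID, so concurrent tensor-parallel workers compiling the same kernel no longer race on one directory. A rough sketch of the resulting layout, assuming Triton's default cache root of `~/.triton/cache` (illustrative, not part of the diff):

```python
import os

# What maybe_set_triton_cache_manager() sets before workers are spawned:
os.environ.setdefault("TRITON_CACHE_MANAGER", "sglang.srt.utils:CustomCacheManager")

# Stock FileCacheManager: every rank shares <cache_root>/<key>.
# CustomCacheManager:     each rank gets <cache_root>_<pid>/<key>.
cache_root = os.path.expanduser("~/.triton/cache")  # assumed default_cache_dir()
key = "deadbeef"  # hypothetical kernel hash
shared = os.path.join(cache_root, key)
private = os.path.join(f"{cache_root}_{os.getpid()}", key)
print(shared)   # same path in every process -> potential compilation race
print(private)  # unique per process -> no race
```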
sglang-0.1.26/sglang/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.26"
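Note: with version.py split out, the version string lives in one place and the package root re-exports it (`from .version import __version__` in the __init__.py hunk above). An illustrative check:

```python
import sglang
from sglang.version import __version__

assert sglang.__version__ == __version__ == "0.1.26"
```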
{sglang-0.1.24 → sglang-0.1.26/sglang.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.24
+Version: 0.1.26
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -244,7 +244,7 @@ Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
 
 <div align="center">
-<img src="assets/logo.png" alt="logo" width="400"></img>
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
 </div>
 
 --------------------------------------------------------------------------------
@@ -282,7 +282,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
-pip install --upgrade pip setuptools wheel
+pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -405,7 +405,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
-- Llama / Llama 2 / Llama 3
+- Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
{sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/SOURCES.txt
@@ -3,7 +3,6 @@ README.md
 pyproject.toml
 sglang/__init__.py
 sglang/api.py
-sglang/bench.py
 sglang/bench_latency.py
 sglang/bench_serving.py
 sglang/check_env.py
@@ -11,18 +10,12 @@ sglang/global_config.py
 sglang/launch_server.py
 sglang/launch_server_llavavid.py
 sglang/utils.py
+sglang/version.py
 sglang.egg-info/PKG-INFO
 sglang.egg-info/SOURCES.txt
 sglang.egg-info/dependency_links.txt
 sglang.egg-info/requires.txt
 sglang.egg-info/top_level.txt
-sglang/backend/__init__.py
-sglang/backend/anthropic.py
-sglang/backend/base_backend.py
-sglang/backend/litellm.py
-sglang/backend/openai.py
-sglang/backend/runtime_endpoint.py
-sglang/backend/vertexai.py
 sglang/lang/__init__.py
 sglang/lang/chat_template.py
 sglang/lang/compiler.py
@@ -42,8 +35,6 @@ sglang/srt/hf_transformers_utils.py
 sglang/srt/memory_pool.py
 sglang/srt/mm_utils.py
 sglang/srt/model_config.py
-sglang/srt/openai_api_adapter.py
-sglang/srt/openai_protocol.py
 sglang/srt/sampling_params.py
 sglang/srt/server.py
 sglang/srt/server_args.py
@@ -65,7 +56,6 @@ sglang/srt/managers/detokenizer_manager.py
 sglang/srt/managers/io_struct.py
 sglang/srt/managers/tokenizer_manager.py
 sglang/srt/managers/controller/cuda_graph_runner.py
-sglang/srt/managers/controller/dp_worker.py
 sglang/srt/managers/controller/infer_batch.py
 sglang/srt/managers/controller/manager_multi.py
 sglang/srt/managers/controller/manager_single.py
@@ -98,9 +88,6 @@ sglang/srt/models/qwen2_moe.py
 sglang/srt/models/stablelm.py
 sglang/srt/models/yivl.py
 sglang/srt/openai_api/adapter.py
-sglang/srt/openai_api/api_adapter.py
-sglang/srt/openai_api/openai_api_adapter.py
-sglang/srt/openai_api/openai_protocol.py
 sglang/srt/openai_api/protocol.py
 sglang/test/test_conversation.py
 sglang/test/test_openai_protocol.py
sglang-0.1.24/sglang/backend/anthropic.py
@@ -1,77 +0,0 @@
-from typing import List, Optional, Union
-
-import numpy as np
-
-from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template
-from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import SglSamplingParams
-
-try:
-    import anthropic
-except ImportError as e:
-    anthropic = e
-
-
-class Anthropic(BaseBackend):
-    def __init__(self, model_name, *args, **kwargs):
-        super().__init__()
-
-        if isinstance(anthropic, Exception):
-            raise anthropic
-
-        self.model_name = model_name
-        self.chat_template = get_chat_template("claude")
-        self.client = anthropic.Anthropic(*args, **kwargs)
-
-    def get_chat_template(self):
-        return self.chat_template
-
-    def generate(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        if messages and messages[0]["role"] == "system":
-            system = messages.pop(0)["content"]
-        else:
-            system = ""
-
-        ret = self.client.messages.create(
-            model=self.model_name,
-            system=system,
-            messages=messages,
-            **sampling_params.to_anthropic_kwargs(),
-        )
-        comp = ret.content[0].text
-
-        return comp, {}
-
-    def generate_stream(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        if messages and messages[0]["role"] == "system":
-            system = messages.pop(0)["content"]
-        else:
-            system = ""
-
-        with self.client.messages.stream(
-            model=self.model_name,
-            system=system,
-            messages=messages,
-            **sampling_params.to_anthropic_kwargs(),
-        ) as stream:
-            for text in stream.text_stream:
-                yield text, {}
sglang-0.1.24/sglang/backend/litellm.py
@@ -1,90 +0,0 @@
-from typing import Mapping, Optional
-
-from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template_by_model_path
-from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import SglSamplingParams
-
-try:
-    import litellm
-except ImportError as e:
-    litellm = e
-litellm.num_retries = 1
-
-
-class LiteLLM(BaseBackend):
-    def __init__(
-        self,
-        model_name,
-        chat_template=None,
-        api_key=None,
-        organization: Optional[str] = None,
-        base_url: Optional[str] = None,
-        timeout: Optional[float] = 600,
-        max_retries: Optional[int] = litellm.num_retries,
-        default_headers: Optional[Mapping[str, str]] = None,
-    ):
-        super().__init__()
-
-        if isinstance(litellm, Exception):
-            raise litellm
-
-        self.model_name = model_name
-
-        self.chat_template = chat_template or get_chat_template_by_model_path(
-            model_name
-        )
-
-        self.client_params = {
-            "api_key": api_key,
-            "organization": organization,
-            "base_url": base_url,
-            "timeout": timeout,
-            "max_retries": max_retries,
-            "default_headers": default_headers,
-        }
-
-    def get_chat_template(self):
-        return self.chat_template
-
-    def generate(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        ret = litellm.completion(
-            model=self.model_name,
-            messages=messages,
-            **self.client_params,
-            **sampling_params.to_anthropic_kwargs(),
-        )
-        comp = ret.choices[0].message.content
-
-        return comp, {}
-
-    def generate_stream(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        ret = litellm.completion(
-            model=self.model_name,
-            messages=messages,
-            stream=True,
-            **self.client_params,
-            **sampling_params.to_litellm_kwargs(),
-        )
-        for chunk in ret:
-            text = chunk.choices[0].delta.content
-            if text is not None:
-                yield text, {}
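Note on the two deletions above: sglang/backend/anthropic.py and sglang/backend/litellm.py (like the rest of the old sglang/backend package) were duplicates of the modules under sglang/lang/backend/, which the file list above (items 34-39) shows as the surviving copies. Code importing the old path needs updating; an illustrative migration, assuming only the module path changed:

```python
# from sglang.backend.anthropic import Anthropic    # ImportError as of 0.1.26
from sglang.lang.backend.anthropic import Anthropic  # surviving location

# The package root also re-exports the common backends, per the
# __init__.py context lines above:
from sglang import OpenAI, RuntimeEndpoint, VertexAI
```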