sglang 0.1.22__py3-none-any.whl → 0.1.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/bench_serving.py +243 -25
- sglang/global_config.py +3 -2
- sglang/lang/interpreter.py +1 -0
- sglang/srt/hf_transformers_utils.py +13 -1
- sglang/srt/layers/logits_processor.py +4 -5
- sglang/srt/layers/radix_attention.py +38 -49
- sglang/srt/managers/controller/cuda_graph_runner.py +58 -16
- sglang/srt/managers/controller/infer_batch.py +51 -22
- sglang/srt/managers/controller/model_runner.py +58 -4
- sglang/srt/managers/controller/schedule_heuristic.py +8 -3
- sglang/srt/managers/controller/tp_worker.py +9 -11
- sglang/srt/memory_pool.py +13 -5
- sglang/srt/models/deepseek.py +430 -0
- sglang/srt/models/gpt_bigcode.py +282 -0
- sglang/srt/models/llama2.py +19 -10
- sglang/srt/server.py +26 -1
- sglang/srt/server_args.py +12 -6
- sglang/srt/utils.py +93 -1
- sglang/version.py +1 -0
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/METADATA +10 -6
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/RECORD +25 -36
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/WHEEL +1 -1
- sglang/backend/__init__.py +0 -0
- sglang/backend/anthropic.py +0 -77
- sglang/backend/base_backend.py +0 -80
- sglang/backend/litellm.py +0 -90
- sglang/backend/openai.py +0 -438
- sglang/backend/runtime_endpoint.py +0 -283
- sglang/backend/vertexai.py +0 -149
- sglang/bench.py +0 -627
- sglang/srt/managers/controller/dp_worker.py +0 -113
- sglang/srt/openai_api/api_adapter.py +0 -432
- sglang/srt/openai_api/openai_api_adapter.py +0 -431
- sglang/srt/openai_api/openai_protocol.py +0 -207
- sglang/srt/openai_api_adapter.py +0 -411
- sglang/srt/openai_protocol.py +0 -207
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/LICENSE +0 -0
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama2.py
CHANGED
@@ -5,14 +5,10 @@
 from typing import Any, Dict, Iterable, Optional, Tuple

 import torch
-import tqdm
 from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
-from vllm.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
@@ -39,6 +35,7 @@ class LlamaMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -46,12 +43,14 @@ class LlamaMLP(nn.Module):
             [intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             intermediate_size,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -70,6 +69,7 @@ class LlamaMLP(nn.Module):
 class LlamaAttention(nn.Module):
     def __init__(
         self,
+        config: LlamaConfig,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -79,6 +79,7 @@ class LlamaAttention(nn.Module):
         rope_is_neox_style: bool = True,
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -96,7 +97,10 @@ class LlamaAttention(nn.Module):
         # the KV heads across multiple tensor parallel GPUs.
         assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = hidden_size // self.total_num_heads
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -110,12 +114,14 @@ class LlamaAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )

         self.rotary_emb = get_rope(
@@ -154,6 +160,7 @@ class LlamaDecoderLayer(nn.Module):
         config: LlamaConfig,
         layer_id: int = 0,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -168,6 +175,7 @@ class LlamaDecoderLayer(nn.Module):
         rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
+            config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
@@ -177,12 +185,14 @@ class LlamaDecoderLayer(nn.Module):
             rope_is_neox_style=rope_is_neox_style,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
         )
         self.mlp = LlamaMLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
@@ -230,7 +240,9 @@ class LlamaModel(nn.Module):
         )
         self.layers = nn.ModuleList(
             [
-                LlamaDecoderLayer(config, i, quant_config=quant_config)
+                LlamaDecoderLayer(
+                    config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                )
                 for i in range(config.num_hidden_layers)
             ]
         )
@@ -370,9 +382,6 @@ class LlamaForCausalLM(nn.Module):
             weight_loader(param, loaded_weight)

         if name is None or loaded_weight is None:
-            if get_tensor_model_parallel_rank() == 0:
-                weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
-
             for name, loaded_weight in weights:
                 load_weights_per_param(name, loaded_weight)
         else:
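The functional change in this file is the `head_dim` fallback, which lets checkpoints such as Mistral-Nemo declare a head dimension that differs from `hidden_size // num_heads`; the `prefix` plumbing only threads layer names into the quantized linear layers. A minimal, self-contained sketch of the fallback pattern (the `DummyConfig` object below is hypothetical and not part of the diff):

```python
from dataclasses import dataclass


@dataclass
class DummyConfig:
    # Mistral-Nemo-style values: head_dim is set explicitly and does not equal
    # hidden_size // num_attention_heads (5120 // 32 == 160).
    hidden_size: int = 5120
    num_attention_heads: int = 32
    head_dim: int = 128


def resolve_head_dim(config) -> int:
    # Fall back to the classic Llama formula when the config carries no head_dim attribute.
    return getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)


print(resolve_head_dim(DummyConfig()))  # 128, not 160
```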
sglang/srt/server.py
CHANGED
@@ -52,6 +52,7 @@ from sglang.srt.utils import (
     allocate_init_ports,
     assert_pkg_version,
     enable_show_time_cost,
+    maybe_set_triton_cache_manager,
     set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -157,6 +158,19 @@ def _set_global_server_args(server_args: ServerArgs):
     }


+def _set_torch_compile_config():
+    # The following configurations are for torch compile optimizations
+    import torch._dynamo.config
+    import torch._inductor.config
+
+    torch._inductor.config.coordinate_descent_tuning = True
+    torch._inductor.config.triton.unique_kernel_names = True
+    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+
+    # FIXME: tmp workaround
+    torch._dynamo.config.accumulated_cache_size_limit = 256
+
+
 def launch_server(
     server_args: ServerArgs,
     model_overide_args: Optional[dict] = None,
@@ -174,6 +188,7 @@ def launch_server(
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     os.environ["NCCL_CUMEM_ENABLE"] = "0"
     os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     set_ulimit()
     if server_args.show_time_cost:
         enable_show_time_cost()
@@ -182,14 +197,23 @@
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
+
+    if server_args.tp_size // server_args.dp_size > 1:
+        # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+        maybe_set_triton_cache_manager()
+
     if server_args.chat_template:
         # TODO: replace this with huggingface transformers template
         load_chat_template_for_openai_api(server_args.chat_template)
+
+    if server_args.enable_torch_compile:
+        _set_torch_compile_config()
+
     _set_global_server_args(server_args)

     # Allocate ports
@@ -205,6 +229,7 @@
         detokenizer_port=ports[2],
         nccl_ports=ports[3:],
     )
+    logger.info(f"{server_args=}")

     # Handle multi-node tensor parallelism
     if server_args.nnodes > 1:
sglang/srt/server_args.py
CHANGED
@@ -29,7 +29,7 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
-    schedule_conservativeness: float = 0
+    schedule_conservativeness: float = 1.0

     # Other runtime options
     tp_size: int = 1
@@ -55,6 +55,7 @@ class ServerArgs:
     disable_regex_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
+    enable_torch_compile: bool = False
     attention_reduce_in_fp32: bool = False
     enable_p2p_check: bool = False
     efficient_weight_load: bool = False
@@ -69,15 +70,15 @@ class ServerArgs:
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.80
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.84
             elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.86
             elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.85
-            else:
                 self.mem_fraction_static = 0.88
+            else:
+                self.mem_fraction_static = 0.89
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
@@ -317,6 +318,11 @@ class ServerArgs:
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--enable-torch-compile",
+            action="store_true",
+            help="Optimize the model with torch.compile, experimental feature.",
+        )
         parser.add_argument(
             "--attention-reduce-in-fp32",
             action="store_true",
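The new defaults reduce to a simple tp_size-to-fraction mapping. A small sketch (not the actual `ServerArgs.__post_init__` code) that mirrors the post-change behavior when `--mem-fraction-static` is left unset:

```python
def default_mem_fraction_static(tp_size: int) -> float:
    # Mirrors the updated defaults: the static memory pool takes a smaller
    # share of GPU memory as the tensor-parallel group grows.
    if tp_size >= 16:
        return 0.80
    if tp_size >= 8:
        return 0.84
    if tp_size >= 4:
        return 0.86
    if tp_size >= 2:
        return 0.88
    return 0.89


for tp in (1, 2, 4, 8, 16):
    print(f"tp={tp}: mem_fraction_static={default_mem_fraction_static(tp)}")
```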
sglang/srt/utils.py
CHANGED
@@ -18,10 +18,15 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
-import triton
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
 from starlette.middleware.base import BaseHTTPMiddleware
+from triton.runtime.cache import (
+    FileCacheManager,
+    default_cache_dir,
+    default_dump_dir,
+    default_override_dir,
+)

 logger = logging.getLogger(__name__)

@@ -312,6 +317,9 @@ def suppress_other_loggers():
     logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
         logging.WARN
     )
+    logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
+        logging.WARN
+    )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.utils").setLevel(logging.WARN)

@@ -411,6 +419,90 @@ def monkey_patch_vllm_dummy_weight_loader():
     setattr(DummyModelLoader, "load_model", load_model)


+vllm_all_gather_backup = None
+
+
+def monkey_patch_vllm_all_gather(reverse: bool = False):
+    """Monkey patch all-gather to remove in-place operations."""
+    from torch.distributed import _functional_collectives as funcol
+    from vllm.distributed.parallel_state import GroupCoordinator
+
+    global vllm_all_gather_backup
+    if vllm_all_gather_backup is None:
+        vllm_all_gather_backup = GroupCoordinator.all_gather
+
+    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            return input_
+        assert (
+            -input_.dim() <= dim < input_.dim()
+        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (world_size,) + input_size, dtype=input_.dtype, device=input_.device
+        )
+
+        output_tensor = funcol.all_gather_tensor(
+            input_, gather_dim=0, group=self.device_group
+        ).view((world_size,) + input_size)
+
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
+
+    if reverse:
+        setattr(GroupCoordinator, "all_gather", vllm_all_gather_backup)
+    else:
+        setattr(GroupCoordinator, "all_gather", all_gather)
+
+
+def maybe_set_triton_cache_manager() -> None:
+    """Set environment variable to tell Triton to use a
+    custom cache manager"""
+    cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
+    if cache_manger is None:
+        manager = "sglang.srt.utils:CustomCacheManager"
+        logger.info("Setting Triton cache manager to: %s", manager)
+        os.environ["TRITON_CACHE_MANAGER"] = manager
+
+
+class CustomCacheManager(FileCacheManager):
+    # Adapted from: https://github.com/tdoublep/vllm/blob/3307522289fdfefe323b6c00d0db696651989a2f/vllm/triton_utils/custom_cache_manager.py
+    def __init__(self, key, override=False, dump=False):
+
+        self.key = key
+        self.lock_path = None
+        if dump:
+            self.cache_dir = default_dump_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+            self.lock_path = os.path.join(self.cache_dir, "lock")
+            os.makedirs(self.cache_dir, exist_ok=True)
+        elif override:
+            self.cache_dir = default_override_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+        else:
+            # create cache directory if it doesn't exist
+            self.cache_dir = (
+                os.getenv("TRITON_CACHE_DIR", "").strip() or default_cache_dir()
+            )
+            if self.cache_dir:
+                self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+                self.cache_dir = os.path.join(self.cache_dir, self.key)
+                self.lock_path = os.path.join(self.cache_dir, "lock")
+                os.makedirs(self.cache_dir, exist_ok=True)
+            else:
+                raise RuntimeError("Could not create or locate cache dir")
+
+
 API_KEY_HEADER_NAME = "X-API-Key"

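The `CustomCacheManager` above exists so that multiple worker processes do not race on a shared Triton kernel cache: it derives a per-process cache directory by appending the PID. A minimal illustration of that naming scheme (the helper below is hypothetical; the real selection happens through the `TRITON_CACHE_MANAGER` environment variable set by `maybe_set_triton_cache_manager()`):

```python
import os


def per_process_cache_dir(base_dir: str, key: str) -> str:
    # Suffix the base cache directory with the current PID so that two worker
    # processes never write partial kernel artifacts into the same directory.
    return os.path.join(f"{base_dir}_{os.getpid()}", key)


print(per_process_cache_dir(os.path.expanduser("~/.triton/cache"), "kernel-abc123"))
```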
sglang/version.py
ADDED
@@ -0,0 +1 @@
+__version__ = "0.1.25"
{sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.22
+Version: 0.1.25
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: torch ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
 Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm ==0.5.
+Requires-Dist: vllm ==0.5.3.post1 ; extra == 'srt'
 Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

 <div align="center">
@@ -282,6 +282,7 @@ The core features include:

 ### Method 1: With pip
 ```
+pip install --upgrade pip
 pip install "sglang[all]"

 # Install FlashInfer CUDA kernels
@@ -293,6 +294,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang

+pip install --upgrade pip
 pip install -e "python[all]"

 # Install FlashInfer CUDA kernels
@@ -390,19 +392,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
-- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0

 # Node 1
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.

 ### Supported Models

-- Llama / Llama 2 / Llama 3
+- Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
@@ -420,6 +423,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Mistral NeMo

 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).

{sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/RECORD
CHANGED
@@ -1,24 +1,17 @@
-sglang/__init__.py,sha256=
+sglang/__init__.py,sha256=UV7VlXhXrwi00Zg45iNB9KcnmrwLjdMtjMz06AiafY0,1151
 sglang/api.py,sha256=1JARbc1wNYF6tODdUpgmNgTyLOvMnxdTBctLvEwzGTY,5565
-sglang/bench.py,sha256=p34wnfMRdiedOUf9GKGZkkNxehmyTzK6Q1O20q_SGjY,21841
 sglang/bench_latency.py,sha256=UPy6WhrddMTDX7HqIeHNhCn5vF0YMOKxJlQRvhMC8zU,10552
-sglang/bench_serving.py,sha256=
+sglang/bench_serving.py,sha256=zKGgVX3S-ggUvOxvEM4AszzXRPRVU6NGNnBG5vAAvRY,34577
 sglang/check_env.py,sha256=CscuPMlf68dkgZf0m-FiLpUisNNDoihMck4qhLOeV1Q,4124
-sglang/global_config.py,sha256=
+sglang/global_config.py,sha256=QG-ABVJksKK_llvUx7fSZcmK4GGCs-hBUVcM4LCr7Nw,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
-sglang/
-sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
-sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
-sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
-sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
-sglang/backend/runtime_endpoint.py,sha256=PAdnQBj3yQNtgw8GH9F1ecGE7HhxGa2T7Tz_c--H2aE,9203
-sglang/backend/vertexai.py,sha256=98toR-L0OTi4dYHaSmmzJdlQ2qN_0lImoKZFlVgYLRE,4850
+sglang/version.py,sha256=Ej7LsXg-6CASlaEHsZkUoLDpYEfHeFKdIeXMIM0esgA,23
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=
+sglang/lang/interpreter.py,sha256=27j7H9p7TY4uUfF9f5E17FxK1xCNeNju4aut_PaWCrQ,29693
 sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,16 +23,14 @@ sglang/lang/backend/runtime_endpoint.py,sha256=TZ0NV89or5_3MIZZFnc1JXAAjnv7tCfeQ
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=Il7JJuu4o42k2xdBWVfONNmstTsAM-4idX6AcEOnrXQ,15526
 sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
-sglang/srt/hf_transformers_utils.py,sha256=
-sglang/srt/memory_pool.py,sha256=
+sglang/srt/hf_transformers_utils.py,sha256=94mOI93B2xOmXKqfJfEoGxqHgwwlWNbPHgsA47AQJK8,11245
+sglang/srt/memory_pool.py,sha256=FhJk5GtYortO3MJIsMMQ-o49agwDHVX1aEQH2LITq6c,3949
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
 sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
-sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
-sglang/srt/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
 sglang/srt/sampling_params.py,sha256=OI11asr1Bd_E5soDjih614v4flgWxdMZU9HAF0aBafQ,3062
-sglang/srt/server.py,sha256=
-sglang/srt/server_args.py,sha256=
-sglang/srt/utils.py,sha256=
+sglang/srt/server.py,sha256=DXhcJt0V24a7yhydP1abPrK1qqV3qt7r8cyOMVOAI4M,14611
+sglang/srt/server_args.py,sha256=aF6L35mEB-FU3BL_ooKuCIcOXLhYLxA9-MjpaOTQRCo,13189
+sglang/srt/utils.py,sha256=bUp3SLzbDms0dvuETaccDPAGRHOIGW5A61pqH62XiT0,20370
 sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
 sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
@@ -48,33 +39,34 @@ sglang/srt/layers/context_flashattention_nopad.py,sha256=7ps_9W_ia9zikL9HqsSUwWH
 sglang/srt/layers/extend_attention.py,sha256=aYAAL9HZJpaSASp-ulMvbmSmyMcqdYUsgVQC-Lbm7_U,12008
 sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
 sglang/srt/layers/linear.py,sha256=qLwFkOiRAljzE7LkAkLRdcCdVMk-t7b56jEjwQAuYDM,33953
-sglang/srt/layers/logits_processor.py,sha256=
-sglang/srt/layers/radix_attention.py,sha256=
+sglang/srt/layers/logits_processor.py,sha256=KyRYANCiq9Cfu_VPjrIbSBAlqN_clcAgF3JrG9waU5k,9674
+sglang/srt/layers/radix_attention.py,sha256=A3J_wOlysjblFXHgehAqRHBQmpYAHLyUovyLFsrMJ7A,6386
 sglang/srt/layers/token_attention.py,sha256=EJ4gjbVLfshOZ_vr1iB-Eq8_B-4F26n_wPDj6e1Zrww,7386
 sglang/srt/layers/quantization/__init__.py,sha256=PQFzdPpul98DvywBA6YMBOnrMjtHE1LMlMpJ7FM8J3I,1971
 sglang/srt/layers/quantization/fp8.py,sha256=jaqgRFnHC--IL8iqB6Qygi-KXYPYBKKqt_j4Rk55_h4,24946
 sglang/srt/managers/detokenizer_manager.py,sha256=8rN2cdMr61LWy07lingEqLnNy0W5Rebdn14IsTQ9PCs,5049
 sglang/srt/managers/io_struct.py,sha256=Y6jW3p0cNg0jcrEQNki1H8MMEWxwWA4p6Y-xVgUVWaI,5404
 sglang/srt/managers/tokenizer_manager.py,sha256=SbivhFhZUR9HU9pLTe93MlYprAFAHzOU3KMBA2piQUk,19308
-sglang/srt/managers/controller/cuda_graph_runner.py,sha256=
-sglang/srt/managers/controller/
-sglang/srt/managers/controller/infer_batch.py,sha256=phXzANqBUFyqFwRVl06bd5yBnGK2hem6qzf5i0lrTq0,33086
+sglang/srt/managers/controller/cuda_graph_runner.py,sha256=0aRqA1_34oJ557Zn8PjpJecex5bBWJdnCmBlcDVvYO0,8509
+sglang/srt/managers/controller/infer_batch.py,sha256=SKwCwhnZ_CNlG0mVCEc4X0e4HNjJFke-c8zdWP3TzjQ,34186
 sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
 sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
-sglang/srt/managers/controller/model_runner.py,sha256=
+sglang/srt/managers/controller/model_runner.py,sha256=FwZ7FU7nhJsYhtoTNxYFc4e6oMEwSqOh8ohXOKtFPKc,15828
 sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
-sglang/srt/managers/controller/schedule_heuristic.py,sha256=
-sglang/srt/managers/controller/tp_worker.py,sha256=
+sglang/srt/managers/controller/schedule_heuristic.py,sha256=SQAGzPS3aB_TPj7rnPBhewwyR6W1sVwW4D3zG3JUY00,2714
+sglang/srt/managers/controller/tp_worker.py,sha256=yjz-Xzl0zEy4QSU-EYneZH5vi3oHtBuXTtYe4VuDp2g,30517
 sglang/srt/model_loader/model_loader.py,sha256=VS8VQL5ITN3akZ9eU_-uHWMan1axLMNG2_O12HzGysA,10132
 sglang/srt/model_loader/utils.py,sha256=I2PS5HIH5Cg-p7xKO_Cw_foK2vQ61xVc3zQv7CbeGEw,10120
 sglang/srt/models/chatglm.py,sha256=pH8g2Dj8qQLGPYpWVTb-IONfXsdfmpWi0-IEYNdSi4s,13296
 sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
 sglang/srt/models/dbrx.py,sha256=rRxOusGPu670ommeqXg62AllwB1apzE4yZoWc1fcr2M,14095
+sglang/srt/models/deepseek.py,sha256=YtoPmv4fKmiH_jsRMSab9Wxq3aOZga9pCPGnkCs3Vvs,15457
 sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
 sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+sglang/srt/models/gpt_bigcode.py,sha256=XHO1naPdXfiKYQRQ6uZe1fN3PBDhKH3-bchsaaZvfE4,9637
 sglang/srt/models/grok.py,sha256=611zrlIchvFaVfztRdBY7z97oU3KB-anykbOZy1hK6M,27295
 sglang/srt/models/internlm2.py,sha256=8MNcwxU5Th9IxWa314HqqmbCRlPUFScnfneBDs0riIU,11659
-sglang/srt/models/llama2.py,sha256=
+sglang/srt/models/llama2.py,sha256=OyAf_lun5aZEsT80WmrIYBF8QXTXRpW8sUlylr4AZIc,14204
 sglang/srt/models/llama_classification.py,sha256=foCPvNyP2bTZ0YcRBF-qkmBv-gT24lhLNCXP30Oq4VU,4370
 sglang/srt/models/llava.py,sha256=vBI6EEeOG_9o23Shi9h8k58rxTOHZnSKMmPl3B3Q3uc,17924
 sglang/srt/models/llavavid.py,sha256=SrNQ-U2wekHvP_up-ZXRkCSros2NzheHpPfXHrp0YBU,13050
@@ -88,16 +80,13 @@ sglang/srt/models/qwen2_moe.py,sha256=oHNoo45myV5kitkls2GWVzuGt1Q4pRHN2nLlXEltFI
 sglang/srt/models/stablelm.py,sha256=Z_XCDSHY_QMz3lZwwkZdIZjEOizZjLYJU9GDi8o08qQ,10802
 sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
 sglang/srt/openai_api/adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
-sglang/srt/openai_api/api_adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
-sglang/srt/openai_api/openai_api_adapter.py,sha256=5pDaktIEteHxp3qN89U_U3ndd7N0FIfUZAM06YeziUY,15687
-sglang/srt/openai_api/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
 sglang/srt/openai_api/protocol.py,sha256=j7ifIR2SFQxTwaHAd9ksM096vfffcNltzTH4sg7H0RA,5739
 sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
 sglang/test/test_programs.py,sha256=uefeHUFKT2NJESOujj-CsnPXdw1aQQN2TzUbPCHJjGs,13654
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.1.
-sglang-0.1.
-sglang-0.1.
-sglang-0.1.
-sglang-0.1.
+sglang-0.1.25.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.25.dist-info/METADATA,sha256=Ifwh2YdZqQXMe2UCOklWFIGeM0KLkfLjBQHv98gS8Pw,30928
+sglang-0.1.25.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+sglang-0.1.25.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.25.dist-info/RECORD,,
sglang/backend/__init__.py
DELETED
File without changes
sglang/backend/anthropic.py
DELETED
@@ -1,77 +0,0 @@
-from typing import List, Optional, Union
-
-import numpy as np
-
-from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template
-from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import SglSamplingParams
-
-try:
-    import anthropic
-except ImportError as e:
-    anthropic = e
-
-
-class Anthropic(BaseBackend):
-    def __init__(self, model_name, *args, **kwargs):
-        super().__init__()
-
-        if isinstance(anthropic, Exception):
-            raise anthropic
-
-        self.model_name = model_name
-        self.chat_template = get_chat_template("claude")
-        self.client = anthropic.Anthropic(*args, **kwargs)
-
-    def get_chat_template(self):
-        return self.chat_template
-
-    def generate(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        if messages and messages[0]["role"] == "system":
-            system = messages.pop(0)["content"]
-        else:
-            system = ""
-
-        ret = self.client.messages.create(
-            model=self.model_name,
-            system=system,
-            messages=messages,
-            **sampling_params.to_anthropic_kwargs(),
-        )
-        comp = ret.content[0].text
-
-        return comp, {}
-
-    def generate_stream(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        if messages and messages[0]["role"] == "system":
-            system = messages.pop(0)["content"]
-        else:
-            system = ""
-
-        with self.client.messages.stream(
-            model=self.model_name,
-            system=system,
-            messages=messages,
-            **sampling_params.to_anthropic_kwargs(),
-        ) as stream:
-            for text in stream.text_stream:
-                yield text, {}