PyPI - sglang - Versions diffs - 0.4.8.post1__tar.gz → 0.4.9__tar.gz - Mend

sglang 0.4.8.post1tar.gz → 0.4.9tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (740) hide show

{sglang-0.4.8.post1/sglang.egg-info → sglang-0.4.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.8.post1
+Version: 0.4.9
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -219,6 +219,7 @@ Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
 Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
+Requires-Dist: build; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -243,19 +244,20 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.52.3; extra == "runtime-common"
+Requires-Dist: transformers==4.53.0; extra == "runtime-common"
+Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
+Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: einops; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.7.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -264,7 +266,7 @@ Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
 Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.6.post1; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.2.7.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -295,7 +297,6 @@ Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
-Requires-Dist: timm; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
@@ -373,6 +374,8 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 ## News
+- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
+- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))

{sglang-0.4.8.post1 → sglang-0.4.9}/README.md RENAMED Viewed

@@ -20,6 +20,8 @@
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 ## News
+- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
+- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))

{sglang-0.4.8.post1 → sglang-0.4.9}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.4.8.post1"
+version = "0.4.9"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -18,6 +18,7 @@ dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle
 [project.optional-dependencies]
 runtime_common = [
     "blobfile==3.0.0",
+    "build",
     "compressed-tensors",
     "datasets",
     "fastapi",
@@ -42,7 +43,8 @@ runtime_common = [
     "soundfile==0.13.1",
     "scipy",
     "torchao==0.9.0",
-    "transformers==4.52.3",
+    "transformers==4.53.0",
+    "timm==1.0.16",
     "uvicorn",
     "uvloop",
     "xgrammar==0.1.19",
@@ -50,13 +52,13 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.1.9",
+    "sgl-kernel==0.2.4",
     "torch==2.7.1",
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
     "cuda-python",
     "einops",
-    "flashinfer_python==0.2.6.post1",
+    "flashinfer_python==0.2.7.post1",
 ]
 blackwell = [
@@ -67,7 +69,7 @@ blackwell = [
     "torchvision==0.22.1",
     "cuda-python",
     "einops",
-    "flashinfer_python==0.2.6.post1",
+    "flashinfer_python==0.2.7.post1",
 ]
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -86,9 +88,7 @@ srt_xpu = ["sglang[runtime_common]"]
 # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
 srt_hpu = ["sglang[runtime_common]"]
-# CPU: currently, there are no pre-built vllm wheels for CPU.
-# To install vllm for CPU, please follow the instruction here:
-# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
+# CPU: torch wheel for CPU needs to be installed from https://download.pytorch.org/whl/cpu
 srt_cpu = ["sglang[runtime_common]", "einops"]
 # https://vllm-ascend.readthedocs.io/en/latest/installation.html
 srt_npu = ["sglang[runtime_common]"]
@@ -104,7 +104,6 @@ test = [
     "matplotlib",
     "pandas",
     "peft",
-    "timm",
     "sentence_transformers",
 ]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[torch_memory_saver]", "sglang[decord]"]

{sglang-0.4.8.post1 → sglang-0.4.9}/sglang/bench_one_batch_server.py RENAMED Viewed

@@ -38,6 +38,7 @@ class BenchArgs:
     output_len: Tuple[int] = (16,)
     temperature: float = 0.0
     return_logprob: bool = False
+    client_stream_interval: int = 1
     input_len_step_percentage: float = 0.0
     result_filename: str = "result.jsonl"
     base_url: str = ""
@@ -60,6 +61,11 @@ class BenchArgs:
         )
         parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
         parser.add_argument("--return-logprob", action="store_true")
+        parser.add_argument(
+            "--client-stream-interval",
+            type=int,
+            default=BenchArgs.client_stream_interval,
+        )
         parser.add_argument(
             "--input-len-step-percentage",
             type=float,
@@ -120,6 +126,7 @@ def run_one_case(
     output_len: int,
     temperature: float,
     return_logprob: bool,
+    stream_interval: int,
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
@@ -168,6 +175,7 @@ def run_one_case(
                 "max_new_tokens": output_len,
                 "ignore_eos": True,
                 "json_schema": json_schema,
+                "stream_interval": stream_interval,
             },
             "return_logprob": return_logprob,
             "stream": True,
@@ -245,8 +253,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     else:
         proc, base_url = launch_server_process(server_args)
-    tokenizer_id = server_args.tokenizer_path or server_args.model_path
-    tokenizer = get_tokenizer(tokenizer_id)
+    server_info = requests.get(base_url + "/get_server_info").json()
+    if "tokenizer_path" in server_info:
+        tokenizer_path = server_info["tokenizer_path"]
+    elif "prefill" in server_info:
+        tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
+    tokenizer = get_tokenizer(tokenizer_path)
     # warmup
     if not bench_args.skip_warmup:
@@ -258,6 +270,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             output_len=16,
             temperature=bench_args.temperature,
             return_logprob=bench_args.return_logprob,
+            stream_interval=bench_args.client_stream_interval,
             input_len_step_percentage=bench_args.input_len_step_percentage,
             run_name="",
             result_filename="",
@@ -280,6 +293,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                     ol,
                     temperature=bench_args.temperature,
                     return_logprob=bench_args.return_logprob,
+                    stream_interval=bench_args.client_stream_interval,
                     input_len_step_percentage=bench_args.input_len_step_percentage,
                     run_name=bench_args.run_name,
                     result_filename=bench_args.result_filename,
@@ -301,6 +315,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                                 ol,
                                 temperature=bench_args.temperature,
                                 return_logprob=bench_args.return_logprob,
+                                stream_interval=bench_args.client_stream_interval,
                                 input_len_step_percentage=bench_args.input_len_step_percentage,
                                 run_name=bench_args.run_name,
                                 result_filename=bench_args.result_filename,

{sglang-0.4.8.post1 → sglang-0.4.9}/sglang/bench_serving.py RENAMED Viewed

@@ -265,6 +265,138 @@ async def async_request_openai_completions(
     return output
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Makes a request to the OpenAI Chat Completions API.
+    Handles both streaming and non-streaming responses, including support
+    for image data in messages. Calculates and returns various performance
+    metrics.
+    Args:
+        request_func_input: Input parameters for the request.
+        pbar: Optional tqdm progress bar to update.
+    Returns:
+        RequestFuncOutput: Output of the request, including generated text,
+                           latency, TTFT, ITL, and success status.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
+    if request_func_input.image_data:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": request_func_input.image_data},
+                    },
+                    {"type": "text", "text": request_func_input.prompt},
+                ],
+            },
+        ]
+    else:
+        messages = [{"role": "user", "content": request_func_input.prompt}]
+    async with _create_bench_client_session() as session:
+        payload = {
+            "model": request_func_input.model,
+            "messages": messages,
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": not args.disable_stream,
+            **request_func_input.extra_request_body,
+        }
+        headers = get_auth_headers()
+        output = RequestFuncOutput.init_new(request_func_input)
+        generated_text = ""
+        output_len = request_func_input.output_len
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    if args.disable_stream:
+                        # Non-streaming response
+                        response_json = await response.json()
+                        output.generated_text = response_json["choices"][0]["message"][
+                            "content"
+                        ]
+                        output.success = True
+                        output.latency = time.perf_counter() - st
+                        output.ttft = (
+                            output.latency
+                        )  # For non-streaming, TTFT = total latency
+                        output.output_len = response_json.get("usage", {}).get(
+                            "completion_tokens", output_len
+                        )
+                    else:
+                        # Streaming response
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+                            chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                            latency = time.perf_counter() - st
+                            if chunk == "[DONE]":
+                                pass
+                            else:
+                                data = json.loads(chunk)
+                                # Check if this chunk contains content
+                                delta = data.get("choices", [{}])[0].get("delta", {})
+                                content = delta.get("content", "")
+                                if content:
+                                    timestamp = time.perf_counter()
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp
+                                        )
+                                    most_recent_timestamp = timestamp
+                                    generated_text += content
+                                # Check for usage info in final chunk
+                                output_len = (data.get("usage") or {}).get(
+                                    "completion_tokens", output_len
+                                )
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = latency
+                        output.output_len = output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+    if pbar:
+        pbar.update(1)
+    return output
 async def async_request_truss(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -544,6 +676,7 @@ def get_dataset(args, tokenizer):
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             fixed_output_len=args.random_output_len,
+            apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
     else:
@@ -555,8 +688,11 @@ ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_sglang_generate,
     "sglang-native": async_request_sglang_generate,
     "sglang-oai": async_request_openai_completions,
+    "sglang-oai-chat": async_request_openai_chat_completions,
     "vllm": async_request_openai_completions,
+    "vllm-chat": async_request_openai_chat_completions,
     "lmdeploy": async_request_openai_completions,
+    "lmdeploy-chat": async_request_openai_chat_completions,
     "trt": async_request_trt_llm,
     "gserver": async_request_gserver,
     "truss": async_request_truss,
@@ -661,6 +797,7 @@ def sample_mmmu_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
+    apply_chat_template: bool = True,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
     """
@@ -670,6 +807,7 @@ def sample_mmmu_requests(
         num_requests: Number of requests to sample.
         tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
+        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.
     Returns:
@@ -739,28 +877,30 @@ def sample_mmmu_requests(
                 # Construct the prompt
                 prompt = f"Question: {question}\n\nAnswer: "
-                try:
-                    prompt = tokenizer.apply_chat_template(
-                        [
-                            {
-                                "role": "user",
-                                "content": [
-                                    {
-                                        "type": "image_url",
-                                        "image_url": {"url": image_data},
-                                    },
-                                    {"type": "text", "text": prompt},
-                                ],
-                            }
-                        ],
-                        add_generation_prompt=True,
-                        tokenize=False,
-                    )
-                except Exception as e:
-                    # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-                    print(f"Error applying chat template: {e}, fallback to <image> tag")
-                    prompt = f"<image>{prompt}"
+                if apply_chat_template:
+                    try:
+                        prompt = tokenizer.apply_chat_template(
+                            [
+                                {
+                                    "role": "user",
+                                    "content": [
+                                        {
+                                            "type": "image_url",
+                                            "image_url": {"url": image_data},
+                                        },
+                                        {"type": "text", "text": prompt},
+                                    ],
+                                }
+                            ],
+                            add_generation_prompt=True,
+                            tokenize=False,
+                        )
+                    except Exception as e:
+                        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+                        print(
+                            f"Error applying chat template: {e}, fallback to <image> tag"
+                        )
+                        prompt = f"<image>{prompt}"
                 # Calculate token lengths for text only (without image data)
                 prompt_token_ids = tokenizer.encode(prompt)
@@ -1544,6 +1684,12 @@ def run_benchmark(args_: argparse.Namespace):
             if args.base_url
             else f"http://{args.host}:{args.port}/v1/completions"
         )
+    elif args.backend in ["sglang-oai-chat", "vllm-chat", "lmdeploy-chat"]:
+        api_url = (
+            f"{args.base_url}/v1/chat/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/chat/completions"
+        )
     elif args.backend == "trt":
         api_url = (
             f"{args.base_url}/v2/models/ensemble/generate_stream"

{sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/internvl.py RENAMED Viewed

@@ -147,12 +147,14 @@ class InternLM2Config(PretrainedConfig):
             )
         if (
             rope_scaling_factor is None
-            or not isinstance(rope_scaling_factor, float)
+            or not isinstance(rope_scaling_factor, (float, int))
             or rope_scaling_factor < 1.0
         ):
             raise ValueError(
-                f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}"
+                f"`rope_scaling`'s factor field must be a float|int >= 1, got {rope_scaling_factor=}, {type(rope_scaling_factor)=}"
             )
+        if isinstance(rope_scaling_factor, int):
+            rope_scaling_factor = float(rope_scaling_factor)
 class InternVisionConfig(PretrainedConfig):

{sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/janus_pro.py RENAMED Viewed

@@ -19,7 +19,7 @@ from transformers import (
 from transformers.image_utils import to_numpy_array
 from sglang.srt.configs.utils import register_image_processor, register_processor
-from sglang.srt.mm_utils import expand2square
+from sglang.srt.multimodal.mm_utils import expand2square
 class DictToObject(dict):

{sglang-0.4.8.post1 → sglang-0.4.9}/sglang/srt/configs/model_config.py RENAMED Viewed

@@ -59,6 +59,7 @@ class ModelConfig:
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
         is_draft_model: bool = False,
+        hybrid_kvcache_ratio: Optional[float] = None,
         impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
@@ -86,6 +87,18 @@ class ModelConfig:
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
         )
+        self.is_hybrid = is_hybrid_model(
+            self.hf_config.architectures,
+            hybrid_kvcache_ratio=hybrid_kvcache_ratio,
+            context_length=context_length,
+            attention_chunk_size=self.attention_chunk_size,
+        )
+        if self.is_hybrid is not None:
+            self.swa_attention_layer_ids, self.full_attention_layer_ids = (
+                get_hybrid_layer_ids(
+                    self.hf_config.architectures, self.hf_text_config.num_hidden_layers
+                )
+            )
         if enable_multimodal is None:
             mm_disabled_models = [
@@ -264,6 +277,7 @@ class ModelConfig:
             enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
+            hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
             impl=server_args.impl,
             **kwargs,
         )
@@ -579,6 +593,7 @@ multimodal_model_archs = [
     "Mistral3ForConditionalGeneration",
     "MultiModalityCausalLM",
     "MllamaForConditionalGeneration",
+    "Qwen2AudioForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
     "KimiVLForConditionalGeneration",
@@ -633,3 +648,36 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     if scale <= 1:
         return 1.0
     return 0.1 * mscale * math.log(scale) + 1.0
+def is_hybrid_model(
+    model_architectures: List[str],
+    hybrid_kvcache_ratio: Optional[float],
+    context_length: Optional[int],
+    attention_chunk_size: Optional[int],
+):
+    if hybrid_kvcache_ratio is None:
+        return None
+    elif (
+        hybrid_kvcache_ratio > 0
+        and model_architectures[0] == "Llama4ForConditionalGeneration"
+        and context_length > attention_chunk_size
+    ):
+        return hybrid_kvcache_ratio
+    else:
+        return None
+def get_hybrid_layer_ids(model_architectures: List[str], num_hidden_layers: int):
+    if "Llama4ForConditionalGeneration" in model_architectures:
+        swa_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 != 0
+        ]
+        full_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
+        ]
+    else:
+        raise ValueError(
+            "get_hybrid_layer_ids is only implemented for Llama4ForConditionalGeneration"
+        )
+    return swa_attention_layer_ids, full_attention_layer_ids

sglang 0.4.8.post1__tar.gz → 0.4.9__tar.gz

sglang 0.4.8.post1tar.gz → 0.4.9tar.gz