sglang 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +168 -22
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +49 -0
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +35 -0
  8. sglang/srt/custom_op.py +7 -1
  9. sglang/srt/disaggregation/base/conn.py +2 -0
  10. sglang/srt/disaggregation/decode.py +22 -6
  11. sglang/srt/disaggregation/mooncake/conn.py +289 -48
  12. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  13. sglang/srt/disaggregation/nixl/conn.py +100 -52
  14. sglang/srt/disaggregation/prefill.py +5 -4
  15. sglang/srt/disaggregation/utils.py +13 -12
  16. sglang/srt/distributed/parallel_state.py +44 -17
  17. sglang/srt/entrypoints/EngineBase.py +8 -0
  18. sglang/srt/entrypoints/engine.py +45 -9
  19. sglang/srt/entrypoints/http_server.py +111 -24
  20. sglang/srt/entrypoints/openai/protocol.py +51 -6
  21. sglang/srt/entrypoints/openai/serving_chat.py +52 -76
  22. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  23. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  24. sglang/srt/eplb/__init__.py +0 -0
  25. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  26. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  27. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  28. sglang/srt/{managers → eplb}/expert_distribution.py +18 -1
  29. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  30. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  31. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  32. sglang/srt/hf_transformers_utils.py +2 -1
  33. sglang/srt/layers/activation.py +7 -0
  34. sglang/srt/layers/amx_utils.py +86 -0
  35. sglang/srt/layers/attention/ascend_backend.py +219 -0
  36. sglang/srt/layers/attention/flashattention_backend.py +56 -23
  37. sglang/srt/layers/attention/tbo_backend.py +37 -9
  38. sglang/srt/layers/communicator.py +18 -2
  39. sglang/srt/layers/dp_attention.py +9 -3
  40. sglang/srt/layers/elementwise.py +76 -12
  41. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  42. sglang/srt/layers/layernorm.py +41 -0
  43. sglang/srt/layers/linear.py +99 -12
  44. sglang/srt/layers/logits_processor.py +15 -6
  45. sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
  46. sglang/srt/layers/moe/ep_moe/layer.py +115 -25
  47. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +42 -19
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  49. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -4
  50. sglang/srt/layers/moe/fused_moe_triton/layer.py +129 -10
  51. sglang/srt/layers/moe/router.py +60 -22
  52. sglang/srt/layers/moe/topk.py +36 -28
  53. sglang/srt/layers/parameter.py +67 -7
  54. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  55. sglang/srt/layers/quantization/fp8.py +44 -0
  56. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  57. sglang/srt/layers/quantization/fp8_utils.py +6 -6
  58. sglang/srt/layers/quantization/gptq.py +5 -1
  59. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  60. sglang/srt/layers/quantization/quant_utils.py +166 -0
  61. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  62. sglang/srt/layers/rotary_embedding.py +105 -13
  63. sglang/srt/layers/vocab_parallel_embedding.py +19 -2
  64. sglang/srt/lora/lora.py +4 -5
  65. sglang/srt/lora/lora_manager.py +73 -20
  66. sglang/srt/managers/configure_logging.py +1 -1
  67. sglang/srt/managers/io_struct.py +60 -15
  68. sglang/srt/managers/mm_utils.py +73 -59
  69. sglang/srt/managers/multimodal_processor.py +2 -6
  70. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  71. sglang/srt/managers/schedule_batch.py +80 -79
  72. sglang/srt/managers/scheduler.py +153 -63
  73. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  74. sglang/srt/managers/session_controller.py +12 -3
  75. sglang/srt/managers/tokenizer_manager.py +314 -103
  76. sglang/srt/managers/tp_worker.py +13 -1
  77. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  78. sglang/srt/mem_cache/allocator.py +290 -0
  79. sglang/srt/mem_cache/chunk_cache.py +34 -2
  80. sglang/srt/mem_cache/memory_pool.py +289 -3
  81. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  82. sglang/srt/model_executor/cuda_graph_runner.py +3 -2
  83. sglang/srt/model_executor/forward_batch_info.py +17 -4
  84. sglang/srt/model_executor/model_runner.py +302 -58
  85. sglang/srt/model_loader/loader.py +86 -10
  86. sglang/srt/model_loader/weight_utils.py +160 -3
  87. sglang/srt/models/deepseek_nextn.py +5 -4
  88. sglang/srt/models/deepseek_v2.py +305 -26
  89. sglang/srt/models/deepseek_vl2.py +3 -5
  90. sglang/srt/models/gemma3_causal.py +1 -2
  91. sglang/srt/models/gemma3n_audio.py +949 -0
  92. sglang/srt/models/gemma3n_causal.py +1010 -0
  93. sglang/srt/models/gemma3n_mm.py +495 -0
  94. sglang/srt/models/hunyuan.py +771 -0
  95. sglang/srt/models/kimi_vl.py +1 -2
  96. sglang/srt/models/llama.py +10 -4
  97. sglang/srt/models/llama4.py +32 -45
  98. sglang/srt/models/llama_eagle3.py +61 -11
  99. sglang/srt/models/llava.py +5 -5
  100. sglang/srt/models/minicpmo.py +2 -2
  101. sglang/srt/models/mistral.py +1 -1
  102. sglang/srt/models/mllama4.py +43 -11
  103. sglang/srt/models/phi4mm.py +1 -3
  104. sglang/srt/models/pixtral.py +3 -7
  105. sglang/srt/models/qwen2.py +31 -3
  106. sglang/srt/models/qwen2_5_vl.py +1 -3
  107. sglang/srt/models/qwen2_audio.py +200 -0
  108. sglang/srt/models/qwen2_moe.py +32 -6
  109. sglang/srt/models/qwen2_vl.py +1 -4
  110. sglang/srt/models/qwen3.py +94 -25
  111. sglang/srt/models/qwen3_moe.py +68 -21
  112. sglang/srt/models/vila.py +3 -8
  113. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +150 -133
  114. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  115. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  116. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  117. sglang/srt/multimodal/processors/gemma3n.py +82 -0
  118. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  119. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +3 -6
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  129. sglang/srt/operations_strategy.py +6 -2
  130. sglang/srt/reasoning_parser.py +26 -0
  131. sglang/srt/sampling/sampling_batch_info.py +39 -1
  132. sglang/srt/server_args.py +85 -24
  133. sglang/srt/speculative/build_eagle_tree.py +57 -18
  134. sglang/srt/speculative/eagle_worker.py +6 -4
  135. sglang/srt/two_batch_overlap.py +204 -28
  136. sglang/srt/utils.py +369 -138
  137. sglang/srt/warmup.py +12 -3
  138. sglang/test/runners.py +10 -1
  139. sglang/test/test_utils.py +15 -3
  140. sglang/version.py +1 -1
  141. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA +9 -6
  142. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/RECORD +149 -137
  143. sglang/math_utils.py +0 -8
  144. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  145. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  146. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  147. /sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +0 -0
  148. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/WHEEL +0 -0
  149. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch_server.py CHANGED
@@ -38,6 +38,7 @@ class BenchArgs:
     output_len: Tuple[int] = (16,)
     temperature: float = 0.0
     return_logprob: bool = False
+    client_stream_interval: int = 1
     input_len_step_percentage: float = 0.0
     result_filename: str = "result.jsonl"
     base_url: str = ""
@@ -60,6 +61,11 @@ class BenchArgs:
         )
         parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
         parser.add_argument("--return-logprob", action="store_true")
+        parser.add_argument(
+            "--client-stream-interval",
+            type=int,
+            default=BenchArgs.client_stream_interval,
+        )
         parser.add_argument(
             "--input-len-step-percentage",
             type=float,
@@ -120,6 +126,7 @@ def run_one_case(
     output_len: int,
     temperature: float,
     return_logprob: bool,
+    stream_interval: int,
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
@@ -168,6 +175,7 @@ def run_one_case(
                 "max_new_tokens": output_len,
                 "ignore_eos": True,
                 "json_schema": json_schema,
+                "stream_interval": stream_interval,
             },
             "return_logprob": return_logprob,
             "stream": True,
@@ -245,8 +253,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     else:
         proc, base_url = launch_server_process(server_args)

-    tokenizer_id = server_args.tokenizer_path or server_args.model_path
-    tokenizer = get_tokenizer(tokenizer_id)
+    server_info = requests.get(base_url + "/get_server_info").json()
+    if "tokenizer_path" in server_info:
+        tokenizer_path = server_info["tokenizer_path"]
+    elif "prefill" in server_info:
+        tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
+    tokenizer = get_tokenizer(tokenizer_path)

     # warmup
     if not bench_args.skip_warmup:
@@ -258,6 +270,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             output_len=16,
             temperature=bench_args.temperature,
             return_logprob=bench_args.return_logprob,
+            stream_interval=bench_args.client_stream_interval,
             input_len_step_percentage=bench_args.input_len_step_percentage,
             run_name="",
             result_filename="",
@@ -280,6 +293,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 ol,
                 temperature=bench_args.temperature,
                 return_logprob=bench_args.return_logprob,
+                stream_interval=bench_args.client_stream_interval,
                 input_len_step_percentage=bench_args.input_len_step_percentage,
                 run_name=bench_args.run_name,
                 result_filename=bench_args.result_filename,
@@ -301,6 +315,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 ol,
                 temperature=bench_args.temperature,
                 return_logprob=bench_args.return_logprob,
+                stream_interval=bench_args.client_stream_interval,
                 input_len_step_percentage=bench_args.input_len_step_percentage,
                 run_name=bench_args.run_name,
                 result_filename=bench_args.result_filename,
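Note: the new --client-stream-interval flag is forwarded to the server as a per-request sampling parameter. A minimal sketch of the resulting /generate payload, assuming the server honors "stream_interval" as shown in the hunk above (field values are illustrative):

# Hypothetical payload built by run_one_case with --client-stream-interval 4
payload = {
    "text": "The capital of France is",
    "sampling_params": {
        "temperature": 0.0,
        "max_new_tokens": 16,
        "ignore_eos": True,
        "stream_interval": 4,  # assumed semantics: stream a chunk roughly every 4 decoded tokens
    },
    "return_logprob": False,
    "stream": True,
}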
sglang/bench_serving.py CHANGED
@@ -265,6 +265,138 @@ async def async_request_openai_completions(
     return output


+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Makes a request to the OpenAI Chat Completions API.
+
+    Handles both streaming and non-streaming responses, including support
+    for image data in messages. Calculates and returns various performance
+    metrics.
+
+    Args:
+        request_func_input: Input parameters for the request.
+        pbar: Optional tqdm progress bar to update.
+
+    Returns:
+        RequestFuncOutput: Output of the request, including generated text,
+        latency, TTFT, ITL, and success status.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
+
+    if request_func_input.image_data:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": request_func_input.image_data},
+                    },
+                    {"type": "text", "text": request_func_input.prompt},
+                ],
+            },
+        ]
+    else:
+        messages = [{"role": "user", "content": request_func_input.prompt}]
+
+    async with _create_bench_client_session() as session:
+        payload = {
+            "model": request_func_input.model,
+            "messages": messages,
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": not args.disable_stream,
+            **request_func_input.extra_request_body,
+        }
+        headers = get_auth_headers()
+
+        output = RequestFuncOutput.init_new(request_func_input)
+
+        generated_text = ""
+        output_len = request_func_input.output_len
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    if args.disable_stream:
+                        # Non-streaming response
+                        response_json = await response.json()
+                        output.generated_text = response_json["choices"][0]["message"][
+                            "content"
+                        ]
+                        output.success = True
+                        output.latency = time.perf_counter() - st
+                        output.ttft = (
+                            output.latency
+                        )  # For non-streaming, TTFT = total latency
+                        output.output_len = response_json.get("usage", {}).get(
+                            "completion_tokens", output_len
+                        )
+                    else:
+                        # Streaming response
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                            latency = time.perf_counter() - st
+                            if chunk == "[DONE]":
+                                pass
+                            else:
+                                data = json.loads(chunk)
+
+                                # Check if this chunk contains content
+                                delta = data.get("choices", [{}])[0].get("delta", {})
+                                content = delta.get("content", "")
+
+                                if content:
+                                    timestamp = time.perf_counter()
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp
+                                        )
+
+                                    most_recent_timestamp = timestamp
+                                    generated_text += content
+
+                                # Check for usage info in final chunk
+                                output_len = (data.get("usage") or {}).get(
+                                    "completion_tokens", output_len
+                                )
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = latency
+                        output.output_len = output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 async def async_request_truss(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -544,6 +676,7 @@ def get_dataset(args, tokenizer):
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             fixed_output_len=args.random_output_len,
+            apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
     else:
@@ -555,8 +688,11 @@ ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_sglang_generate,
     "sglang-native": async_request_sglang_generate,
     "sglang-oai": async_request_openai_completions,
+    "sglang-oai-chat": async_request_openai_chat_completions,
     "vllm": async_request_openai_completions,
+    "vllm-chat": async_request_openai_chat_completions,
     "lmdeploy": async_request_openai_completions,
+    "lmdeploy-chat": async_request_openai_chat_completions,
     "trt": async_request_trt_llm,
     "gserver": async_request_gserver,
     "truss": async_request_truss,
@@ -661,6 +797,7 @@ def sample_mmmu_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
+    apply_chat_template: bool = True,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
     """
@@ -670,6 +807,7 @@ def sample_mmmu_requests(
         num_requests: Number of requests to sample.
         tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
+        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.

     Returns:
@@ -739,28 +877,30 @@ def sample_mmmu_requests(

         # Construct the prompt
         prompt = f"Question: {question}\n\nAnswer: "
-
-        try:
-            prompt = tokenizer.apply_chat_template(
-                [
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "image_url",
-                                "image_url": {"url": image_data},
-                            },
-                            {"type": "text", "text": prompt},
-                        ],
-                    }
-                ],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
-        except Exception as e:
-            # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-            print(f"Error applying chat template: {e}, fallback to <image> tag")
-            prompt = f"<image>{prompt}"
+        if apply_chat_template:
+            try:
+                prompt = tokenizer.apply_chat_template(
+                    [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": image_data},
+                                },
+                                {"type": "text", "text": prompt},
+                            ],
+                        }
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception as e:
+                # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+                print(
+                    f"Error applying chat template: {e}, fallback to <image> tag"
+                )
+                prompt = f"<image>{prompt}"

         # Calculate token lengths for text only (without image data)
         prompt_token_ids = tokenizer.encode(prompt)
@@ -1544,6 +1684,12 @@ def run_benchmark(args_: argparse.Namespace):
             if args.base_url
             else f"http://{args.host}:{args.port}/v1/completions"
         )
+    elif args.backend in ["sglang-oai-chat", "vllm-chat", "lmdeploy-chat"]:
+        api_url = (
+            f"{args.base_url}/v1/chat/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/chat/completions"
+        )
     elif args.backend == "trt":
         api_url = (
             f"{args.base_url}/v2/models/ensemble/generate_stream"
sglang/srt/configs/internvl.py CHANGED
@@ -147,12 +147,14 @@ class InternLM2Config(PretrainedConfig):
         )
         if (
             rope_scaling_factor is None
-            or not isinstance(rope_scaling_factor, float)
+            or not isinstance(rope_scaling_factor, (float, int))
             or rope_scaling_factor < 1.0
         ):
             raise ValueError(
-                f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}"
+                f"`rope_scaling`'s factor field must be a float|int >= 1, got {rope_scaling_factor=}, {type(rope_scaling_factor)=}"
             )
+        if isinstance(rope_scaling_factor, int):
+            rope_scaling_factor = float(rope_scaling_factor)


 class InternVisionConfig(PretrainedConfig):
sglang/srt/configs/janus_pro.py CHANGED
@@ -19,7 +19,7 @@ from transformers import (
 from transformers.image_utils import to_numpy_array

 from sglang.srt.configs.utils import register_image_processor, register_processor
-from sglang.srt.mm_utils import expand2square
+from sglang.srt.multimodal.mm_utils import expand2square


 class DictToObject(dict):
sglang/srt/configs/model_config.py CHANGED
@@ -59,6 +59,7 @@ class ModelConfig:
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
         is_draft_model: bool = False,
+        hybrid_kvcache_ratio: Optional[float] = None,
         impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:

@@ -86,6 +87,18 @@ class ModelConfig:
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
         )
+        self.is_hybrid = is_hybrid_model(
+            self.hf_config.architectures,
+            hybrid_kvcache_ratio=hybrid_kvcache_ratio,
+            context_length=context_length,
+            attention_chunk_size=self.attention_chunk_size,
+        )
+        if self.is_hybrid is not None:
+            self.swa_attention_layer_ids, self.full_attention_layer_ids = (
+                get_hybrid_layer_ids(
+                    self.hf_config.architectures, self.hf_text_config.num_hidden_layers
+                )
+            )

         if enable_multimodal is None:
             mm_disabled_models = [
@@ -264,6 +277,7 @@ class ModelConfig:
             enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
+            hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
             impl=server_args.impl,
             **kwargs,
         )
@@ -565,6 +579,7 @@ multimodal_model_archs = [
     "CLIPModel",
     "DeepseekVL2ForCausalLM",
     "Gemma3ForConditionalGeneration",
+    "Gemma3nForConditionalGeneration",
     "Grok1VForCausalLM",
     "Grok1AForCausalLM",
     "LlavaLlamaForCausalLM",
@@ -578,6 +593,7 @@ multimodal_model_archs = [
     "Mistral3ForConditionalGeneration",
     "MultiModalityCausalLM",
     "MllamaForConditionalGeneration",
+    "Qwen2AudioForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
     "KimiVLForConditionalGeneration",
@@ -632,3 +648,36 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     if scale <= 1:
         return 1.0
     return 0.1 * mscale * math.log(scale) + 1.0
+
+
+def is_hybrid_model(
+    model_architectures: List[str],
+    hybrid_kvcache_ratio: Optional[float],
+    context_length: Optional[int],
+    attention_chunk_size: Optional[int],
+):
+    if hybrid_kvcache_ratio is None:
+        return None
+    elif (
+        hybrid_kvcache_ratio > 0
+        and model_architectures[0] == "Llama4ForConditionalGeneration"
+        and context_length > attention_chunk_size
+    ):
+        return hybrid_kvcache_ratio
+    else:
+        return None
+
+
+def get_hybrid_layer_ids(model_architectures: List[str], num_hidden_layers: int):
+    if "Llama4ForConditionalGeneration" in model_architectures:
+        swa_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 != 0
+        ]
+        full_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
+        ]
+    else:
+        raise ValueError(
+            "get_hybrid_layer_ids is only implemented for Llama4ForConditionalGeneration"
+        )
+    return swa_attention_layer_ids, full_attention_layer_ids
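Note: the (i + 1) % 4 rule above interleaves three sliding-window layers with one full-attention layer. A standalone illustration with a hypothetical 8-layer model (not a real config):

num_hidden_layers = 8
swa_layers = [i for i in range(num_hidden_layers) if (i + 1) % 4 != 0]
full_layers = [i for i in range(num_hidden_layers) if (i + 1) % 4 == 0]
print(swa_layers)   # [0, 1, 2, 4, 5, 6]
print(full_layers)  # [3, 7]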
sglang/srt/configs/update_config.py ADDED
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+DEFAULT_MOE_PADDING_SIZE = 32
+
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.load_config import LoadConfig
+    from sglang.srt.configs.model_config import ModelConfig
+
+
+def may_get_weight_block_size(model_config, load_config):
+    from sglang.srt.model_loader.loader import _get_quantization_config
+    from sglang.srt.model_loader.utils import get_model_architecture
+
+    model_class, _ = get_model_architecture(model_config)
+    packed_modules_mapping = getattr(model_class, "packed_modules_mapping", {})
+
+    quant_config = _get_quantization_config(
+        model_config, load_config, packed_modules_mapping
+    )
+
+    if quant_config is not None and hasattr(quant_config, "weight_block_size"):
+        return getattr(quant_config, "weight_block_size")
+    return None
+
+
+def get_moe_padding_size(weight_block_size):
+    if weight_block_size is not None:
+        # See NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
+        assert (
+            len(weight_block_size) == 2
+        ), "Only len(weight_block_size) == 2 is supported"
+        assert (
+            weight_block_size[0] == weight_block_size[1]
+        ), "Only weight_block_size[0] == weight_block_size[1] is supported"
+
+        return weight_block_size[0]
+
+    return DEFAULT_MOE_PADDING_SIZE
+
+
+def get_num_heads_padding_size(tp_size, weight_block_size):
+    pad_size = (
+        tp_size * 2 if tp_size % 2 == 1 and weight_block_size is not None else tp_size
+    )
+    return pad_size
+
+
+def update_intermediate_size(model_config, attr_name, intermediate_padding_size):
+    if hasattr(model_config.hf_config, attr_name):
+        attr_value = getattr(model_config.hf_config, attr_name)
+        if attr_value % intermediate_padding_size != 0:
+            from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+            attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+            setattr(model_config.hf_config, attr_name, attr_value)
+            setattr(model_config.hf_text_config, attr_name, attr_value)
+    return model_config
+
+
+def adjust_config_with_unaligned_cpu_tp(
+    model_config: ModelConfig, load_config: LoadConfig, tp_size: int
+) -> ModelConfig:
+    # Support the case where the num_attention_heads is not divisible by the TP size.
+    weight_block_size = may_get_weight_block_size(model_config, load_config)
+
+    model_config.hf_config.original_num_attention_heads = (
+        model_config.num_attention_heads
+    )
+    model_config.hf_text_config.original_num_attention_heads = (
+        model_config.num_attention_heads
+    )
+
+    model_config.hf_config.original_total_num_kv_heads = (
+        model_config.get_total_num_kv_heads()
+    )
+    model_config.hf_text_config.original_total_num_kv_heads = (
+        model_config.get_total_num_kv_heads()
+    )
+
+    if (
+        model_config.num_attention_heads % tp_size != 0
+        or model_config.get_total_num_kv_heads() % tp_size != 0
+    ):
+        # Compute the head_dim using the model_config.num_attention_heads before padding
+        if not hasattr(model_config.hf_config, "head_dim"):
+            model_config.hf_config.head_dim = (
+                model_config.hidden_size // model_config.num_attention_heads
+            )
+
+        query_heads_per_kv = (
+            model_config.num_attention_heads // model_config.get_total_num_kv_heads()
+        )
+        total_kv_heads = model_config.get_total_num_kv_heads()
+        from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+        pad_size = get_num_heads_padding_size(tp_size, weight_block_size)
+        num_key_value_heads = pad_vocab_size(total_kv_heads, pad_size)
+
+        model_config.num_key_value_heads = num_key_value_heads
+        model_config.hf_config.num_key_value_heads = num_key_value_heads
+        model_config.hf_text_config.num_key_value_heads = num_key_value_heads
+
+        num_attention_heads = num_key_value_heads * query_heads_per_kv
+        model_config.num_attention_heads = num_attention_heads
+        model_config.hf_config.num_attention_heads = num_attention_heads
+        model_config.hf_text_config.num_attention_heads = num_attention_heads
+
+    intermediate_padding_size = tp_size * get_moe_padding_size(weight_block_size)
+    model_config = update_intermediate_size(
+        model_config, "moe_intermediate_size", intermediate_padding_size
+    )
+    model_config = update_intermediate_size(
+        model_config, "intermediate_size", intermediate_padding_size
+    )
+
+    return model_config
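Note: a worked example of the padding rules in this new module (the numbers are illustrative, not taken from any real checkpoint): with tp_size=3 and a 128x128 block-quantized model, head counts are padded to a multiple of 6 (an odd tp_size is doubled) and intermediate sizes to a multiple of 3 * 128 = 384.

def pad_to_multiple(x, multiple):
    # same rounding-up behavior as pad_vocab_size
    return ((x + multiple - 1) // multiple) * multiple

tp_size = 3
weight_block_size = [128, 128]
pad_size = tp_size * 2 if tp_size % 2 == 1 and weight_block_size is not None else tp_size
print(pad_to_multiple(8, pad_size))           # 8 kv heads  -> 12
print(pad_to_multiple(11008, tp_size * 128))  # intermediate_size 11008 -> 11136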
sglang/srt/conversation.py CHANGED
@@ -59,6 +59,7 @@ class SeparatorStyle(IntEnum):
     METAMATH = auto()
     DeepSeekVL2 = auto()
     QWEN2_VL_EMBED = auto()
+    QWEN2_AUDIO = auto()
     GEMMA3 = auto()
     MPT = auto()

@@ -350,6 +351,23 @@ class Conversation:
                 else:
                     ret += role
             return ret
+        elif self.sep_style == SeparatorStyle.QWEN2_AUDIO:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+
+            counter = 1
+            for role, message in self.messages:
+                if message:
+                    while self.audio_token in message:
+                        message = message.replace(
+                            self.audio_token, self.audio_token.format(idx=counter), 1
+                        )
+                        counter += 1
+
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+
+            return ret
         else:
             raise ValueError(f"Invalid style: {self.sep_style}")

@@ -823,6 +841,7 @@ register_conv_template(
         sep_style=SeparatorStyle.GEMMA3,
         stop_str=["<end_of_turn>"],
         image_token="<start_of_image>",
+        audio_token="<start_of_audio>",
     )
 )

@@ -903,6 +922,20 @@ register_conv_template(
 )


+register_conv_template(
+    Conversation(
+        name="qwen2-audio",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.QWEN2_AUDIO,
+        stop_str=["<|im_end|>"],
+        audio_token="Audio {idx}: <|audio_bos|><|AUDIO|><|audio_eos|>\n",
+    )
+)
+
+
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
     if re.search(r"internvl2_5", model_path, re.IGNORECASE):
@@ -955,6 +988,8 @@ def match_qwen_chat_ml(model_path: str):
         return "gme-qwen2-vl"
     if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
         return "qwen2-vl"
+    if re.search(r"qwen.*audio", model_path, re.IGNORECASE):
+        return "qwen2-audio"
     if re.search(
         r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
         model_path,
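Note: the QWEN2_AUDIO separator numbers each audio placeholder in a message. A small standalone sketch using the template string registered above (the message content is made up):

audio_token = "Audio {idx}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
message = audio_token * 2 + "What do you hear in these clips?"
counter = 1
while audio_token in message:
    # replace one raw placeholder at a time, numbering them 1, 2, ...
    message = message.replace(audio_token, audio_token.format(idx=counter), 1)
    counter += 1
print(message)
# Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>
# Audio 2: <|audio_bos|><|AUDIO|><|audio_eos|>
# What do you hear in these clips?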
sglang/srt/custom_op.py CHANGED
@@ -1,11 +1,12 @@
 from torch import nn

-from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_hip
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_hip, is_npu

 _is_cuda = is_cuda()
 _is_hip = is_hip()
 _is_cpu = is_cpu()
 _is_cpu_amx_available = cpu_has_amx_support()
+_is_npu = is_npu()


 class CustomOp(nn.Module):
@@ -60,6 +61,9 @@ class CustomOp(nn.Module):
     def forward_cuda(self, *args, **kwargs):
         raise NotImplementedError

+    def forward_npu(self, *args, **kwargs):
+        raise NotImplementedError
+
     def forward_hip(self, *args, **kwargs):
         return self.forward_cuda(*args, **kwargs)

@@ -79,5 +83,7 @@ class CustomOp(nn.Module):
             return self.forward_hip
         elif _is_cpu and _is_cpu_amx_available:
             return self.forward_cpu
+        elif _is_npu:
+            return self.forward_npu
         else:
             return self.forward_native
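Note: CustomOp's dispatch now falls through to forward_npu when is_npu() reports an Ascend device. A hedged sketch of a subclass (the op itself is a made-up identity, not part of sglang):

from sglang.srt.custom_op import CustomOp

class IdentityOpExample(CustomOp):
    # forward_native is the generic fallback; forward_npu is selected on NPU builds
    def forward_native(self, x):
        return x

    def forward_npu(self, x):
        return x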
sglang/srt/disaggregation/base/conn.py CHANGED
@@ -27,6 +27,8 @@ class KVArgs:
     decode_tp_size: int
     # for pp prefill
     prefill_pp_size: int
+    kv_head_num: int
+    page_size: int


 class KVPoll: