sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +170 -24
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +60 -1
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +69 -1
  8. sglang/srt/disaggregation/decode.py +21 -5
  9. sglang/srt/disaggregation/mooncake/conn.py +35 -4
  10. sglang/srt/disaggregation/nixl/conn.py +6 -6
  11. sglang/srt/disaggregation/prefill.py +2 -2
  12. sglang/srt/disaggregation/utils.py +1 -1
  13. sglang/srt/distributed/parallel_state.py +44 -17
  14. sglang/srt/entrypoints/EngineBase.py +8 -0
  15. sglang/srt/entrypoints/engine.py +40 -6
  16. sglang/srt/entrypoints/http_server.py +111 -24
  17. sglang/srt/entrypoints/http_server_engine.py +1 -1
  18. sglang/srt/entrypoints/openai/protocol.py +4 -2
  19. sglang/srt/eplb/__init__.py +0 -0
  20. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  21. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  22. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  23. sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
  24. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  25. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  26. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  27. sglang/srt/hf_transformers_utils.py +2 -1
  28. sglang/srt/layers/activation.py +2 -2
  29. sglang/srt/layers/amx_utils.py +86 -0
  30. sglang/srt/layers/attention/ascend_backend.py +219 -0
  31. sglang/srt/layers/attention/flashattention_backend.py +32 -9
  32. sglang/srt/layers/attention/tbo_backend.py +37 -9
  33. sglang/srt/layers/communicator.py +20 -2
  34. sglang/srt/layers/dp_attention.py +9 -3
  35. sglang/srt/layers/elementwise.py +76 -12
  36. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  37. sglang/srt/layers/layernorm.py +26 -0
  38. sglang/srt/layers/linear.py +84 -14
  39. sglang/srt/layers/logits_processor.py +4 -4
  40. sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  41. sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
  42. sglang/srt/layers/moe/ep_moe/layer.py +176 -15
  43. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
  44. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
  45. sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
  46. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  47. sglang/srt/layers/moe/router.py +60 -22
  48. sglang/srt/layers/moe/topk.py +10 -28
  49. sglang/srt/layers/parameter.py +67 -7
  50. sglang/srt/layers/quantization/__init__.py +2 -0
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  52. sglang/srt/layers/quantization/fp8.py +72 -7
  53. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  54. sglang/srt/layers/quantization/fp8_utils.py +1 -2
  55. sglang/srt/layers/quantization/gptq.py +5 -1
  56. sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  57. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  58. sglang/srt/layers/quantization/quant_utils.py +166 -0
  59. sglang/srt/layers/quantization/w4afp8.py +264 -0
  60. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  61. sglang/srt/layers/rotary_embedding.py +2 -2
  62. sglang/srt/layers/vocab_parallel_embedding.py +20 -10
  63. sglang/srt/lora/lora.py +4 -5
  64. sglang/srt/lora/lora_manager.py +73 -20
  65. sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  66. sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  67. sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  68. sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  69. sglang/srt/managers/cache_controller.py +41 -195
  70. sglang/srt/managers/configure_logging.py +1 -1
  71. sglang/srt/managers/io_struct.py +58 -14
  72. sglang/srt/managers/mm_utils.py +77 -61
  73. sglang/srt/managers/multimodal_processor.py +2 -6
  74. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  75. sglang/srt/managers/schedule_batch.py +78 -85
  76. sglang/srt/managers/scheduler.py +130 -64
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  78. sglang/srt/managers/session_controller.py +12 -3
  79. sglang/srt/managers/tokenizer_manager.py +314 -103
  80. sglang/srt/managers/tp_worker.py +13 -1
  81. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  82. sglang/srt/mem_cache/allocator.py +290 -0
  83. sglang/srt/mem_cache/chunk_cache.py +34 -2
  84. sglang/srt/mem_cache/hiradix_cache.py +2 -0
  85. sglang/srt/mem_cache/memory_pool.py +402 -66
  86. sglang/srt/mem_cache/memory_pool_host.py +6 -109
  87. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  88. sglang/srt/mem_cache/radix_cache.py +8 -4
  89. sglang/srt/model_executor/cuda_graph_runner.py +2 -1
  90. sglang/srt/model_executor/forward_batch_info.py +17 -4
  91. sglang/srt/model_executor/model_runner.py +297 -56
  92. sglang/srt/model_loader/loader.py +41 -0
  93. sglang/srt/model_loader/weight_utils.py +72 -4
  94. sglang/srt/models/deepseek_nextn.py +1 -3
  95. sglang/srt/models/deepseek_v2.py +195 -45
  96. sglang/srt/models/deepseek_vl2.py +3 -5
  97. sglang/srt/models/gemma3_causal.py +1 -2
  98. sglang/srt/models/gemma3n_causal.py +4 -3
  99. sglang/srt/models/gemma3n_mm.py +4 -20
  100. sglang/srt/models/hunyuan.py +1 -1
  101. sglang/srt/models/kimi_vl.py +1 -2
  102. sglang/srt/models/llama.py +10 -4
  103. sglang/srt/models/llama4.py +32 -45
  104. sglang/srt/models/llama_eagle3.py +61 -11
  105. sglang/srt/models/llava.py +5 -5
  106. sglang/srt/models/minicpmo.py +2 -2
  107. sglang/srt/models/mistral.py +1 -1
  108. sglang/srt/models/mllama4.py +402 -89
  109. sglang/srt/models/phi4mm.py +1 -3
  110. sglang/srt/models/pixtral.py +3 -7
  111. sglang/srt/models/qwen2.py +31 -3
  112. sglang/srt/models/qwen2_5_vl.py +1 -3
  113. sglang/srt/models/qwen2_audio.py +200 -0
  114. sglang/srt/models/qwen2_moe.py +32 -6
  115. sglang/srt/models/qwen2_vl.py +1 -4
  116. sglang/srt/models/qwen3.py +94 -25
  117. sglang/srt/models/qwen3_moe.py +68 -21
  118. sglang/srt/models/vila.py +3 -8
  119. sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  129. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  130. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  131. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
  132. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  133. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  134. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  135. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  136. sglang/srt/operations_strategy.py +6 -2
  137. sglang/srt/reasoning_parser.py +26 -0
  138. sglang/srt/sampling/sampling_batch_info.py +39 -1
  139. sglang/srt/server_args.py +84 -22
  140. sglang/srt/speculative/build_eagle_tree.py +57 -18
  141. sglang/srt/speculative/eagle_worker.py +6 -4
  142. sglang/srt/two_batch_overlap.py +203 -27
  143. sglang/srt/utils.py +343 -163
  144. sglang/srt/warmup.py +12 -3
  145. sglang/test/runners.py +10 -1
  146. sglang/test/test_cutlass_w4a8_moe.py +281 -0
  147. sglang/test/test_utils.py +15 -3
  148. sglang/utils.py +5 -5
  149. sglang/version.py +1 -1
  150. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
  151. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
  152. sglang/math_utils.py +0 -8
  153. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  154. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  155. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  156. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
  157. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
  158. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch_server.py CHANGED
@@ -38,6 +38,7 @@ class BenchArgs:
     output_len: Tuple[int] = (16,)
     temperature: float = 0.0
     return_logprob: bool = False
+    client_stream_interval: int = 1
     input_len_step_percentage: float = 0.0
     result_filename: str = "result.jsonl"
     base_url: str = ""
@@ -60,6 +61,11 @@ class BenchArgs:
         )
         parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
         parser.add_argument("--return-logprob", action="store_true")
+        parser.add_argument(
+            "--client-stream-interval",
+            type=int,
+            default=BenchArgs.client_stream_interval,
+        )
         parser.add_argument(
             "--input-len-step-percentage",
             type=float,
@@ -120,6 +126,7 @@ def run_one_case(
     output_len: int,
     temperature: float,
     return_logprob: bool,
+    stream_interval: int,
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
@@ -168,6 +175,7 @@ def run_one_case(
                 "max_new_tokens": output_len,
                 "ignore_eos": True,
                 "json_schema": json_schema,
+                "stream_interval": stream_interval,
             },
             "return_logprob": return_logprob,
             "stream": True,
@@ -245,8 +253,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     else:
         proc, base_url = launch_server_process(server_args)
 
-    tokenizer_id = server_args.tokenizer_path or server_args.model_path
-    tokenizer = get_tokenizer(tokenizer_id)
+    server_info = requests.get(base_url + "/get_server_info").json()
+    if "tokenizer_path" in server_info:
+        tokenizer_path = server_info["tokenizer_path"]
+    elif "prefill" in server_info:
+        tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
+    tokenizer = get_tokenizer(tokenizer_path)
 
     # warmup
     if not bench_args.skip_warmup:
@@ -258,6 +270,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             output_len=16,
             temperature=bench_args.temperature,
             return_logprob=bench_args.return_logprob,
+            stream_interval=bench_args.client_stream_interval,
             input_len_step_percentage=bench_args.input_len_step_percentage,
             run_name="",
             result_filename="",
@@ -280,6 +293,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 ol,
                 temperature=bench_args.temperature,
                 return_logprob=bench_args.return_logprob,
+                stream_interval=bench_args.client_stream_interval,
                 input_len_step_percentage=bench_args.input_len_step_percentage,
                 run_name=bench_args.run_name,
                 result_filename=bench_args.result_filename,
@@ -301,6 +315,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 ol,
                 temperature=bench_args.temperature,
                 return_logprob=bench_args.return_logprob,
+                stream_interval=bench_args.client_stream_interval,
                 input_len_step_percentage=bench_args.input_len_step_percentage,
                 run_name=bench_args.run_name,
                 result_filename=bench_args.result_filename,
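Note on the tokenizer change above: instead of deriving the tokenizer from the local ServerArgs, the benchmark now asks the running server for its tokenizer path via /get_server_info, falling back to the first prefill entry for disaggregated prefill/decode deployments. A minimal standalone sketch of that lookup (the helper name and the explicit error branch are illustrative, not part of the diff):

import requests

from sglang.srt.hf_transformers_utils import get_tokenizer


def resolve_tokenizer(base_url: str):
    # Ask the server which tokenizer it is actually serving; PD-disaggregated
    # deployments report the path under their "prefill" entries instead.
    server_info = requests.get(base_url + "/get_server_info").json()
    if "tokenizer_path" in server_info:
        tokenizer_path = server_info["tokenizer_path"]
    elif "prefill" in server_info:
        tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
    else:
        raise KeyError("/get_server_info did not expose a tokenizer_path")
    return get_tokenizer(tokenizer_path)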
sglang/bench_serving.py CHANGED
@@ -265,6 +265,138 @@ async def async_request_openai_completions(
     return output
 
 
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Makes a request to the OpenAI Chat Completions API.
+
+    Handles both streaming and non-streaming responses, including support
+    for image data in messages. Calculates and returns various performance
+    metrics.
+
+    Args:
+        request_func_input: Input parameters for the request.
+        pbar: Optional tqdm progress bar to update.
+
+    Returns:
+        RequestFuncOutput: Output of the request, including generated text,
+        latency, TTFT, ITL, and success status.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
+
+    if request_func_input.image_data:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": request_func_input.image_data},
+                    },
+                    {"type": "text", "text": request_func_input.prompt},
+                ],
+            },
+        ]
+    else:
+        messages = [{"role": "user", "content": request_func_input.prompt}]
+
+    async with _create_bench_client_session() as session:
+        payload = {
+            "model": request_func_input.model,
+            "messages": messages,
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": not args.disable_stream,
+            **request_func_input.extra_request_body,
+        }
+        headers = get_auth_headers()
+
+        output = RequestFuncOutput.init_new(request_func_input)
+
+        generated_text = ""
+        output_len = request_func_input.output_len
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    if args.disable_stream:
+                        # Non-streaming response
+                        response_json = await response.json()
+                        output.generated_text = response_json["choices"][0]["message"][
+                            "content"
+                        ]
+                        output.success = True
+                        output.latency = time.perf_counter() - st
+                        output.ttft = (
+                            output.latency
+                        )  # For non-streaming, TTFT = total latency
+                        output.output_len = response_json.get("usage", {}).get(
+                            "completion_tokens", output_len
+                        )
+                    else:
+                        # Streaming response
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                            latency = time.perf_counter() - st
+                            if chunk == "[DONE]":
+                                pass
+                            else:
+                                data = json.loads(chunk)
+
+                                # Check if this chunk contains content
+                                delta = data.get("choices", [{}])[0].get("delta", {})
+                                content = delta.get("content", "")
+
+                                if content:
+                                    timestamp = time.perf_counter()
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp
+                                        )
+
+                                    most_recent_timestamp = timestamp
+                                    generated_text += content
+
+                                # Check for usage info in final chunk
+                                output_len = (data.get("usage") or {}).get(
+                                    "completion_tokens", output_len
+                                )
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = latency
+                        output.output_len = output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 async def async_request_truss(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -544,6 +676,7 @@ def get_dataset(args, tokenizer):
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             fixed_output_len=args.random_output_len,
+            apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
     else:
@@ -555,8 +688,11 @@ ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_sglang_generate,
     "sglang-native": async_request_sglang_generate,
     "sglang-oai": async_request_openai_completions,
+    "sglang-oai-chat": async_request_openai_chat_completions,
     "vllm": async_request_openai_completions,
+    "vllm-chat": async_request_openai_chat_completions,
     "lmdeploy": async_request_openai_completions,
+    "lmdeploy-chat": async_request_openai_chat_completions,
     "trt": async_request_trt_llm,
     "gserver": async_request_gserver,
     "truss": async_request_truss,
@@ -661,6 +797,7 @@ def sample_mmmu_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
+    apply_chat_template: bool = True,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
     """
@@ -670,15 +807,16 @@ def sample_mmmu_requests(
         num_requests: Number of requests to sample.
        tokenizer: Tokenizer to use for token counting.
        fixed_output_len: If provided, use this fixed output length for all requests.
+        apply_chat_template: Whether to apply the chat template to the prompt.
        random_sample: Whether to randomly sample or take the first N.
 
     Returns:
         List of tuples (prompt, prompt_token_len, output_token_len).
     """
     try:
-        import base64
         import io
 
+        import pybase64
         from datasets import load_dataset
     except ImportError:
         raise ImportError("Please install datasets: pip install datasets")
@@ -729,7 +867,7 @@ def sample_mmmu_requests(
                 # Encode image to base64
                 buffered = io.BytesIO()
                 image.save(buffered, format="JPEG")
-                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                img_str = pybase64.b64encode(buffered.getvalue()).decode("utf-8")
                 image_data = f"data:image/jpeg;base64,{img_str}"
             else:
                 continue
@@ -739,28 +877,30 @@ def sample_mmmu_requests(
 
             # Construct the prompt
             prompt = f"Question: {question}\n\nAnswer: "
-
-            try:
-                prompt = tokenizer.apply_chat_template(
-                    [
-                        {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": image_data},
-                                },
-                                {"type": "text", "text": prompt},
-                            ],
-                        }
-                    ],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-            except Exception as e:
-                # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-                print(f"Error applying chat template: {e}, fallback to <image> tag")
-                prompt = f"<image>{prompt}"
+            if apply_chat_template:
+                try:
+                    prompt = tokenizer.apply_chat_template(
+                        [
+                            {
+                                "role": "user",
+                                "content": [
+                                    {
+                                        "type": "image_url",
+                                        "image_url": {"url": image_data},
+                                    },
+                                    {"type": "text", "text": prompt},
+                                ],
+                            }
+                        ],
+                        add_generation_prompt=True,
+                        tokenize=False,
+                    )
+                except Exception as e:
+                    # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+                    print(
+                        f"Error applying chat template: {e}, fallback to <image> tag"
+                    )
+                    prompt = f"<image>{prompt}"
 
             # Calculate token lengths for text only (without image data)
             prompt_token_ids = tokenizer.encode(prompt)
@@ -1544,6 +1684,12 @@ def run_benchmark(args_: argparse.Namespace):
             if args.base_url
            else f"http://{args.host}:{args.port}/v1/completions"
        )
+    elif args.backend in ["sglang-oai-chat", "vllm-chat", "lmdeploy-chat"]:
+        api_url = (
+            f"{args.base_url}/v1/chat/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/chat/completions"
+        )
     elif args.backend == "trt":
         api_url = (
             f"{args.base_url}/v2/models/ensemble/generate_stream"
sglang/srt/configs/internvl.py CHANGED
@@ -147,12 +147,14 @@ class InternLM2Config(PretrainedConfig):
             )
         if (
             rope_scaling_factor is None
-            or not isinstance(rope_scaling_factor, float)
+            or not isinstance(rope_scaling_factor, (float, int))
             or rope_scaling_factor < 1.0
         ):
             raise ValueError(
-                f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}"
+                f"`rope_scaling`'s factor field must be a float|int >= 1, got {rope_scaling_factor=}, {type(rope_scaling_factor)=}"
             )
+        if isinstance(rope_scaling_factor, int):
+            rope_scaling_factor = float(rope_scaling_factor)
 
 
 class InternVisionConfig(PretrainedConfig):
sglang/srt/configs/janus_pro.py CHANGED
@@ -19,7 +19,7 @@ from transformers import (
 from transformers.image_utils import to_numpy_array
 
 from sglang.srt.configs.utils import register_image_processor, register_processor
-from sglang.srt.mm_utils import expand2square
+from sglang.srt.multimodal.mm_utils import expand2square
 
 
 class DictToObject(dict):
sglang/srt/configs/model_config.py CHANGED
@@ -59,6 +59,7 @@ class ModelConfig:
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
         is_draft_model: bool = False,
+        hybrid_kvcache_ratio: Optional[float] = None,
         impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
 
@@ -86,6 +87,18 @@ class ModelConfig:
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
         )
+        self.is_hybrid = is_hybrid_model(
+            self.hf_config.architectures,
+            hybrid_kvcache_ratio=hybrid_kvcache_ratio,
+            context_length=context_length,
+            attention_chunk_size=self.attention_chunk_size,
+        )
+        if self.is_hybrid is not None:
+            self.swa_attention_layer_ids, self.full_attention_layer_ids = (
+                get_hybrid_layer_ids(
+                    self.hf_config.architectures, self.hf_text_config.num_hidden_layers
+                )
+            )
 
         if enable_multimodal is None:
             mm_disabled_models = [
@@ -264,6 +277,7 @@ class ModelConfig:
             enable_multimodal=server_args.enable_multimodal,
             dtype=server_args.dtype,
             quantization=server_args.quantization,
+            hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
             impl=server_args.impl,
             **kwargs,
         )
@@ -345,7 +359,17 @@ class ModelConfig:
                 if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
                     quant_cfg = modelopt_quant_config
             elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
-                quant_cfg = modelopt_quant_config
+                quant_config_file = os.path.join(
+                    self.model_path, "hf_quant_config.json"
+                )
+                with open(quant_config_file) as f:
+                    quant_config_dict = json.load(f)
+                json_quant_configs = quant_config_dict["quantization"]
+                quant_algo = json_quant_configs.get("quant_algo", None)
+                if quant_algo == "MIXED_PRECISION":
+                    quant_cfg = {"quant_method": "w4afp8"}
+                else:
+                    quant_cfg = modelopt_quant_config
         return quant_cfg
 
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
@@ -375,6 +399,7 @@ class ModelConfig:
             "w8a8_fp8",
             "moe_wna16",
             "qoq",
+            "w4afp8",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
@@ -579,6 +604,7 @@ multimodal_model_archs = [
     "Mistral3ForConditionalGeneration",
     "MultiModalityCausalLM",
     "MllamaForConditionalGeneration",
+    "Qwen2AudioForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
     "KimiVLForConditionalGeneration",
@@ -633,3 +659,36 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     if scale <= 1:
         return 1.0
     return 0.1 * mscale * math.log(scale) + 1.0
+
+
+def is_hybrid_model(
+    model_architectures: List[str],
+    hybrid_kvcache_ratio: Optional[float],
+    context_length: Optional[int],
+    attention_chunk_size: Optional[int],
+):
+    if hybrid_kvcache_ratio is None:
+        return None
+    elif (
+        hybrid_kvcache_ratio > 0
+        and model_architectures[0] == "Llama4ForConditionalGeneration"
+        and context_length > attention_chunk_size
+    ):
+        return hybrid_kvcache_ratio
+    else:
+        return None
+
+
+def get_hybrid_layer_ids(model_architectures: List[str], num_hidden_layers: int):
+    if "Llama4ForConditionalGeneration" in model_architectures:
+        swa_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 != 0
+        ]
+        full_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
+        ]
+    else:
+        raise ValueError(
+            "get_hybrid_layer_ids is only implemented for Llama4ForConditionalGeneration"
+        )
+    return swa_attention_layer_ids, full_attention_layer_ids
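For reference, the layer split that get_hybrid_layer_ids produces above follows a fixed every-fourth-layer pattern: layers whose 1-based index is a multiple of 4 keep full attention, the rest use sliding-window attention. A small standalone check (12 layers chosen purely for brevity):

num_hidden_layers = 12
swa_attention_layer_ids = [i for i in range(num_hidden_layers) if (i + 1) % 4 != 0]
full_attention_layer_ids = [i for i in range(num_hidden_layers) if (i + 1) % 4 == 0]
print(swa_attention_layer_ids)   # [0, 1, 2, 4, 5, 6, 8, 9, 10]
print(full_attention_layer_ids)  # [3, 7, 11]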
sglang/srt/configs/update_config.py ADDED
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+DEFAULT_MOE_PADDING_SIZE = 32
+
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.load_config import LoadConfig
+    from sglang.srt.configs.model_config import ModelConfig
+
+
+def may_get_weight_block_size(model_config, load_config):
+    from sglang.srt.model_loader.loader import _get_quantization_config
+    from sglang.srt.model_loader.utils import get_model_architecture
+
+    model_class, _ = get_model_architecture(model_config)
+    packed_modules_mapping = getattr(model_class, "packed_modules_mapping", {})
+
+    quant_config = _get_quantization_config(
+        model_config, load_config, packed_modules_mapping
+    )
+
+    if quant_config is not None and hasattr(quant_config, "weight_block_size"):
+        return getattr(quant_config, "weight_block_size")
+    return None
+
+
+def get_moe_padding_size(weight_block_size):
+    if weight_block_size is not None:
+        # See NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
+        assert (
+            len(weight_block_size) == 2
+        ), "Only len(weight_block_size) == 2 is supported"
+        assert (
+            weight_block_size[0] == weight_block_size[1]
+        ), "Only weight_block_size[0] == weight_block_size[1] is supported"
+
+        return weight_block_size[0]
+
+    return DEFAULT_MOE_PADDING_SIZE
+
+
+def get_num_heads_padding_size(tp_size, weight_block_size):
+    pad_size = (
+        tp_size * 2 if tp_size % 2 == 1 and weight_block_size is not None else tp_size
+    )
+    return pad_size
+
+
+def update_intermediate_size(model_config, attr_name, intermediate_padding_size):
+    if hasattr(model_config.hf_config, attr_name):
+        attr_value = getattr(model_config.hf_config, attr_name)
+        if attr_value % intermediate_padding_size != 0:
+            from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+            attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+            setattr(model_config.hf_config, attr_name, attr_value)
+            setattr(model_config.hf_text_config, attr_name, attr_value)
+    return model_config
+
+
+def adjust_config_with_unaligned_cpu_tp(
+    model_config: ModelConfig, load_config: LoadConfig, tp_size: int
+) -> ModelConfig:
+    # Support the case where the num_attention_heads is not divisible by the TP size.
+    weight_block_size = may_get_weight_block_size(model_config, load_config)
+
+    model_config.hf_config.original_num_attention_heads = (
+        model_config.num_attention_heads
+    )
+    model_config.hf_text_config.original_num_attention_heads = (
+        model_config.num_attention_heads
+    )
+
+    model_config.hf_config.original_total_num_kv_heads = (
+        model_config.get_total_num_kv_heads()
+    )
+    model_config.hf_text_config.original_total_num_kv_heads = (
+        model_config.get_total_num_kv_heads()
+    )
+
+    if (
+        model_config.num_attention_heads % tp_size != 0
+        or model_config.get_total_num_kv_heads() % tp_size != 0
+    ):
+        # Compute the head_dim using the model_config.num_attention_heads before padding
+        if not hasattr(model_config.hf_config, "head_dim"):
+            model_config.hf_config.head_dim = (
+                model_config.hidden_size // model_config.num_attention_heads
+            )
+
+        query_heads_per_kv = (
+            model_config.num_attention_heads // model_config.get_total_num_kv_heads()
+        )
+        total_kv_heads = model_config.get_total_num_kv_heads()
+        from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+        pad_size = get_num_heads_padding_size(tp_size, weight_block_size)
+        num_key_value_heads = pad_vocab_size(total_kv_heads, pad_size)
+
+        model_config.num_key_value_heads = num_key_value_heads
+        model_config.hf_config.num_key_value_heads = num_key_value_heads
+        model_config.hf_text_config.num_key_value_heads = num_key_value_heads
+
+        num_attention_heads = num_key_value_heads * query_heads_per_kv
+        model_config.num_attention_heads = num_attention_heads
+        model_config.hf_config.num_attention_heads = num_attention_heads
+        model_config.hf_text_config.num_attention_heads = num_attention_heads
+
+        intermediate_padding_size = tp_size * get_moe_padding_size(weight_block_size)
+        model_config = update_intermediate_size(
+            model_config, "moe_intermediate_size", intermediate_padding_size
+        )
+        model_config = update_intermediate_size(
+            model_config, "intermediate_size", intermediate_padding_size
+        )
+
+    return model_config
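The padding rules in this new module compose as follows; a worked example under assumed inputs (40 query heads, 8 KV heads, a [128, 128] weight block size, and tp_size=3 — numbers chosen for illustration, not taken from any particular model):

def pad_up(value, multiple):
    # Mirrors the round-up-to-a-multiple behavior that pad_vocab_size provides.
    return (value + multiple - 1) // multiple * multiple


tp_size, weight_block_size = 3, [128, 128]
num_attention_heads, num_kv_heads = 40, 8

# get_num_heads_padding_size: an odd tp_size with block-quantized weights doubles the pad unit.
pad_size = tp_size * 2 if tp_size % 2 == 1 and weight_block_size is not None else tp_size
padded_kv_heads = pad_up(num_kv_heads, pad_size)  # 8 -> 12, now divisible by tp_size
padded_attention_heads = padded_kv_heads * (num_attention_heads // num_kv_heads)  # 12 * 5 = 60

# get_moe_padding_size: intermediate sizes are padded to a multiple of tp_size * block_n.
intermediate_padding_size = tp_size * weight_block_size[0]  # 3 * 128 = 384

print(pad_size, padded_kv_heads, padded_attention_heads, intermediate_padding_size)  # 6 12 60 384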