sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +23 -3
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +5 -16
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +218 -79
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/topk.py +30 -3
- sglang/srt/layers/quantization/__init__.py +134 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +12 -0
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/multimodal_processors/clip.py +63 -0
- sglang/srt/managers/schedule_batch.py +1 -0
- sglang/srt/managers/scheduler.py +25 -19
- sglang/srt/managers/tokenizer_manager.py +0 -1
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +9 -8
- sglang/srt/model_executor/model_runner.py +9 -6
- sglang/srt/model_loader/loader.py +11 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +563 -0
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +151 -26
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +6 -0
- sglang/srt/openai_api/adapter.py +88 -87
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/server_args.py +21 -11
- sglang/srt/speculative/eagle_worker.py +1 -1
- sglang/srt/utils.py +33 -0
- sglang/test/runners.py +27 -2
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +8 -4
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +57 -53
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
| @@ -28,6 +28,7 @@ class ModelCard(BaseModel): | |
| 28 28 | 
             
                created: int = Field(default_factory=lambda: int(time.time()))
         | 
| 29 29 | 
             
                owned_by: str = "sglang"
         | 
| 30 30 | 
             
                root: Optional[str] = None
         | 
| 31 | 
            +
                max_model_len: Optional[int] = None
         | 
| 31 32 |  | 
| 32 33 |  | 
| 33 34 | 
             
            class ModelList(BaseModel):
         | 
| @@ -187,7 +188,7 @@ class CompletionResponseChoice(BaseModel): | |
| 187 188 | 
             
                index: int
         | 
| 188 189 | 
             
                text: str
         | 
| 189 190 | 
             
                logprobs: Optional[LogProbs] = None
         | 
| 190 | 
            -
                finish_reason:  | 
| 191 | 
            +
                finish_reason: Literal["stop", "length", "content_filter"]
         | 
| 191 192 | 
             
                matched_stop: Union[None, int, str] = None
         | 
| 192 193 |  | 
| 193 194 |  | 
| @@ -204,7 +205,7 @@ class CompletionResponseStreamChoice(BaseModel): | |
| 204 205 | 
             
                index: int
         | 
| 205 206 | 
             
                text: str
         | 
| 206 207 | 
             
                logprobs: Optional[LogProbs] = None
         | 
| 207 | 
            -
                finish_reason: Optional[ | 
| 208 | 
            +
                finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
         | 
| 208 209 | 
             
                matched_stop: Union[None, int, str] = None
         | 
| 209 210 |  | 
| 210 211 |  | 
| @@ -322,7 +323,7 @@ class ChatCompletionRequest(BaseModel): | |
| 322 323 | 
             
                max_tokens: Optional[int] = None
         | 
| 323 324 | 
             
                n: int = 1
         | 
| 324 325 | 
             
                presence_penalty: float = 0.0
         | 
| 325 | 
            -
                response_format: Union[ResponseFormat, StructuralTagResponseFormat] = None
         | 
| 326 | 
            +
                response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
         | 
| 326 327 | 
             
                seed: Optional[int] = None
         | 
| 327 328 | 
             
                stop: Optional[Union[str, List[str]]] = None
         | 
| 328 329 | 
             
                stream: bool = False
         | 
| @@ -387,7 +388,9 @@ class ChatCompletionResponseChoice(BaseModel): | |
| 387 388 | 
             
                index: int
         | 
| 388 389 | 
             
                message: ChatMessage
         | 
| 389 390 | 
             
                logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
         | 
| 390 | 
            -
                finish_reason:  | 
| 391 | 
            +
                finish_reason: Literal[
         | 
| 392 | 
            +
                    "stop", "length", "tool_calls", "content_filter", "function_call"
         | 
| 393 | 
            +
                ]
         | 
| 391 394 | 
             
                matched_stop: Union[None, int, str] = None
         | 
| 392 395 |  | 
| 393 396 |  | 
| @@ -411,7 +414,9 @@ class ChatCompletionResponseStreamChoice(BaseModel): | |
| 411 414 | 
             
                index: int
         | 
| 412 415 | 
             
                delta: DeltaMessage
         | 
| 413 416 | 
             
                logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
         | 
| 414 | 
            -
                finish_reason: Optional[ | 
| 417 | 
            +
                finish_reason: Optional[
         | 
| 418 | 
            +
                    Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
         | 
| 419 | 
            +
                ] = None
         | 
| 415 420 | 
             
                matched_stop: Union[None, int, str] = None
         | 
| 416 421 |  | 
| 417 422 |  | 
| @@ -0,0 +1,71 @@ | |
| 1 | 
            +
            # Copyright 2023-2024 SGLang Team
         | 
| 2 | 
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 3 | 
            +
            # you may not use this file except in compliance with the License.
         | 
| 4 | 
            +
            # You may obtain a copy of the License at
         | 
| 5 | 
            +
            #
         | 
| 6 | 
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            # Unless required by applicable law or agreed to in writing, software
         | 
| 9 | 
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 10 | 
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 11 | 
            +
            # See the License for the specific language governing permissions and
         | 
| 12 | 
            +
            # limitations under the License.
         | 
| 13 | 
            +
            # ==============================================================================
         | 
| 14 | 
            +
            from typing import Callable, Union
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            import torch
         | 
| 17 | 
            +
            from torch.multiprocessing import reductions
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            def monkey_patch_torch_reductions():
                """Swap torch's tensor reduce/rebuild hooks for the wrappers in this module.

                Temporary workaround until the change proposed in
                https://github.com/pytorch/pytorch/pull/149248 is available upstream.

                The function is idempotent: the presence of the stashed
                ``_reduce_tensor_original`` attribute marks that patching already happened,
                in which case nothing is done.
                """
                already_patched = hasattr(reductions, "_reduce_tensor_original")
                if not already_patched:
                    # Keep the stock implementations reachable; the wrappers delegate to them.
                    reductions._reduce_tensor_original = reductions.reduce_tensor
                    reductions._rebuild_cuda_tensor_original = reductions.rebuild_cuda_tensor

                    # Point the module attributes at the replacements.
                    reductions.reduce_tensor = _reduce_tensor_modified
                    reductions.rebuild_cuda_tensor = _rebuild_cuda_tensor_modified

                    # Re-run torch's reduction setup after swapping the attributes
                    # (presumably so its registrations pick up the patched reduce_tensor).
                    reductions.init_reductions()
         | 
| 33 | 
            +
             | 
| 34 | 
            +
             | 
| 35 | 
            +
            # The argument layout of reductions.rebuild_cuda_tensor has not been changed for years, and we will not need this patch when the next Torch version is released,
         | 
| 36 | 
            +
            # so it looks safe to use a constant.
         | 
| 37 | 
            +
            _REDUCE_TENSOR_ARG_DEVICE_INDEX = 6
         | 
| 38 | 
            +
             | 
| 39 | 
            +
             | 
| 40 | 
            +
            def _reduce_tensor_modified(*args, **kwargs):
                """Reduce a tensor like torch does, but ship the CUDA device as a UUID string.

                Delegates to the stashed original ``reduce_tensor`` and then replaces the
                device index inside the returned argument tuple with the device's UUID, so
                the receiving side can map it back to its own local index.

                Args:
                    *args: Forwarded unchanged to the original ``reduce_tensor``.
                    **kwargs: Forwarded unchanged to the original ``reduce_tensor``.

                Returns:
                    The ``(rebuild_fn, rebuild_args)`` pair produced by the original
                    function; ``rebuild_args`` is rewritten only for the CUDA rebuild path.
                """
                output_fn, output_args = reductions._reduce_tensor_original(*args, **kwargs)

                # Only the CUDA rebuild function takes a device index at
                # _REDUCE_TENSOR_ARG_DEVICE_INDEX. Other rebuild paths (CPU, meta, ...)
                # use different argument tuples, so rewriting them would raise IndexError
                # or convert an unrelated field. Leave those untouched.
                if output_fn is not reductions.rebuild_cuda_tensor:
                    return output_fn, output_args

                output_args = _modify_tuple(
                    output_args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_to_uuid
                )
                return output_fn, output_args
         | 
| 46 | 
            +
             | 
| 47 | 
            +
             | 
| 48 | 
            +
            def _rebuild_cuda_tensor_modified(*args):
                """Rebuild a CUDA tensor whose device argument may be an index or a UUID.

                The device entry of ``args`` is normalised to a local device index and the
                resulting arguments are handed to the stashed original
                ``rebuild_cuda_tensor``.
                """
                resolved_args = _modify_tuple(
                    args,
                    _REDUCE_TENSOR_ARG_DEVICE_INDEX,
                    _device_from_maybe_uuid,
                )
                return reductions._rebuild_cuda_tensor_original(*resolved_args)
         | 
| 51 | 
            +
             | 
| 52 | 
            +
             | 
| 53 | 
            +
            def _device_to_uuid(device: int) -> str:
                """Return the UUID reported for CUDA device ``device``, as a string."""
                properties = torch.cuda.get_device_properties(device)
                return str(properties.uuid)
         | 
| 55 | 
            +
             | 
| 56 | 
            +
             | 
| 57 | 
            +
            def _device_from_maybe_uuid(device_maybe_uuid: Union[int, str]) -> int:
         | 
| 58 | 
            +
                if isinstance(device_maybe_uuid, int):
         | 
| 59 | 
            +
                    return device_maybe_uuid
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                if isinstance(device_maybe_uuid, str):
         | 
| 62 | 
            +
                    for device in range(torch.cuda.device_count()):
         | 
| 63 | 
            +
                        if str(torch.cuda.get_device_properties(device).uuid) == device_maybe_uuid:
         | 
| 64 | 
            +
                            return device
         | 
| 65 | 
            +
                    raise Exception("Invalid device_uuid=" + device_maybe_uuid)
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                raise Exception(f"Unknown type: {device_maybe_uuid=}")
         | 
| 68 | 
            +
             | 
| 69 | 
            +
             | 
| 70 | 
            +
            def _modify_tuple(t, index: int, modifier: Callable):
         | 
| 71 | 
            +
                return *t[:index], modifier(t[index]), *t[index + 1 :]
         | 
    
        sglang/srt/server_args.py
    CHANGED
    
    | @@ -24,6 +24,7 @@ from typing import List, Optional | |
| 24 24 | 
             
            from sglang.srt.hf_transformers_utils import check_gguf_file
         | 
| 25 25 | 
             
            from sglang.srt.reasoning_parser import ReasoningParser
         | 
| 26 26 | 
             
            from sglang.srt.utils import (
         | 
| 27 | 
            +
                configure_ipv6,
         | 
| 27 28 | 
             
                get_amdgpu_memory_capacity,
         | 
| 28 29 | 
             
                get_device,
         | 
| 29 30 | 
             
                get_hpu_memory_capacity,
         | 
| @@ -52,7 +53,7 @@ class ServerArgs: | |
| 52 53 | 
             
                dtype: str = "auto"
         | 
| 53 54 | 
             
                kv_cache_dtype: str = "auto"
         | 
| 54 55 | 
             
                quantization: Optional[str] = None
         | 
| 55 | 
            -
                quantization_param_path:  | 
| 56 | 
            +
                quantization_param_path: Optional[str] = None
         | 
| 56 57 | 
             
                context_length: Optional[int] = None
         | 
| 57 58 | 
             
                device: Optional[str] = None
         | 
| 58 59 | 
             
                served_model_name: Optional[str] = None
         | 
| @@ -140,7 +141,7 @@ class ServerArgs: | |
| 140 141 |  | 
| 141 142 | 
             
                # Double Sparsity
         | 
| 142 143 | 
             
                enable_double_sparsity: bool = False
         | 
| 143 | 
            -
                ds_channel_config_path: str = None
         | 
| 144 | 
            +
                ds_channel_config_path: Optional[str] = None
         | 
| 144 145 | 
             
                ds_heavy_channel_num: int = 32
         | 
| 145 146 | 
             
                ds_heavy_token_num: int = 256
         | 
| 146 147 | 
             
                ds_heavy_channel_type: str = "qk"
         | 
| @@ -173,7 +174,7 @@ class ServerArgs: | |
| 173 174 | 
             
                enable_memory_saver: bool = False
         | 
| 174 175 | 
             
                allow_auto_truncate: bool = False
         | 
| 175 176 | 
             
                enable_custom_logit_processor: bool = False
         | 
| 176 | 
            -
                tool_call_parser: str = None
         | 
| 177 | 
            +
                tool_call_parser: Optional[str] = None
         | 
| 177 178 | 
             
                enable_hierarchical_cache: bool = False
         | 
| 178 179 | 
             
                hicache_ratio: float = 2.0
         | 
| 179 180 | 
             
                enable_flashinfer_mla: bool = False
         | 
| @@ -290,12 +291,17 @@ class ServerArgs: | |
| 290 291 | 
             
                        logger.warning(
         | 
| 291 292 | 
             
                            f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
         | 
| 292 293 | 
             
                        )
         | 
| 293 | 
            -
             | 
| 294 | 
            -
             | 
| 295 | 
            -
             | 
| 296 | 
            -
             | 
| 297 | 
            -
             | 
| 298 | 
            -
             | 
| 294 | 
            +
             | 
| 295 | 
            +
                    self.enable_sp_layernorm = False
         | 
| 296 | 
            +
                    # DeepEP MoE
         | 
| 297 | 
            +
                    if self.enable_deepep_moe:
         | 
| 298 | 
            +
                        self.ep_size = self.tp_size
         | 
| 299 | 
            +
                        self.enable_sp_layernorm = (
         | 
| 300 | 
            +
                            self.dp_size < self.tp_size if self.enable_dp_attention else True
         | 
| 301 | 
            +
                        )
         | 
| 302 | 
            +
                        logger.info(
         | 
| 303 | 
            +
                            f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
         | 
| 304 | 
            +
                        )
         | 
| 299 305 |  | 
| 300 306 | 
             
                    # Speculative Decoding
         | 
| 301 307 | 
             
                    if self.speculative_algorithm == "NEXTN":
         | 
| @@ -1200,8 +1206,12 @@ class PortArgs: | |
| 1200 1206 | 
             
                        # DP attention. Use TCP + port to handle both single-node and multi-node.
         | 
| 1201 1207 | 
             
                        if server_args.nnodes == 1 and server_args.dist_init_addr is None:
         | 
| 1202 1208 | 
             
                            dist_init_addr = ("127.0.0.1", server_args.port + ZMQ_TCP_PORT_DELTA)
         | 
| 1209 | 
            +
                        elif server_args.dist_init_addr.startswith("["):  # ipv6 address
         | 
| 1210 | 
            +
                            port_num, host = configure_ipv6(server_args.dist_init_addr)
         | 
| 1211 | 
            +
                            dist_init_addr = (host, str(port_num))
         | 
| 1203 1212 | 
             
                        else:
         | 
| 1204 1213 | 
             
                            dist_init_addr = server_args.dist_init_addr.split(":")
         | 
| 1214 | 
            +
             | 
| 1205 1215 | 
             
                        assert (
         | 
| 1206 1216 | 
             
                            len(dist_init_addr) == 2
         | 
| 1207 1217 | 
             
                        ), "please provide --dist-init-addr as host:port of head node"
         | 
| @@ -1210,10 +1220,10 @@ class PortArgs: | |
| 1210 1220 | 
             
                        port_base = int(dist_init_port) + 1
         | 
| 1211 1221 | 
             
                        if dp_rank is None:
         | 
| 1212 1222 | 
             
                            scheduler_input_port = (
         | 
| 1213 | 
            -
                                port_base +  | 
| 1223 | 
            +
                                port_base + 3
         | 
| 1214 1224 | 
             
                            )  # TokenizerManager to DataParallelController
         | 
| 1215 1225 | 
             
                        else:
         | 
| 1216 | 
            -
                            scheduler_input_port = port_base +  | 
| 1226 | 
            +
                            scheduler_input_port = port_base + 3 + 1 + dp_rank
         | 
| 1217 1227 |  | 
| 1218 1228 | 
             
                        return PortArgs(
         | 
| 1219 1229 | 
             
                            tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
         | 
| @@ -586,5 +586,5 @@ def load_token_map(token_map_path: str) -> List[int]: | |
| 586 586 | 
             
                        ignore_patterns=["*.bin", "*.safetensors"],
         | 
| 587 587 | 
             
                    )
         | 
| 588 588 | 
             
                    token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
         | 
| 589 | 
            -
                hot_token_id = torch.load(token_map_path)
         | 
| 589 | 
            +
                hot_token_id = torch.load(token_map_path, weights_only=True)
         | 
| 590 590 | 
             
                return torch.tensor(hot_token_id, dtype=torch.int32)
         | 
    
        sglang/srt/utils.py
    CHANGED
    
    | @@ -1602,6 +1602,7 @@ def get_ip() -> str: | |
| 1602 1602 | 
             
            def get_open_port() -> int:
         | 
| 1603 1603 | 
             
                port = os.getenv("SGLANG_PORT")
         | 
| 1604 1604 | 
             
                if port is not None:
         | 
| 1605 | 
            +
                    port = int(port)
         | 
| 1605 1606 | 
             
                    while True:
         | 
| 1606 1607 | 
             
                        try:
         | 
| 1607 1608 | 
             
                            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         | 
| @@ -1630,6 +1631,38 @@ def is_valid_ipv6_address(address: str) -> bool: | |
| 1630 1631 | 
             
                    return False
         | 
| 1631 1632 |  | 
| 1632 1633 |  | 
| 1634 | 
            +
            def configure_ipv6(dist_init_addr):
                """Split a bracketed ``[ipv6]:port`` address into its port and host parts.

                Args:
                    dist_init_addr: Address string in the form ``[ipv6]:port``.

                Returns:
                    A ``(port, host)`` tuple where ``port`` is an ``int`` and ``host`` is
                    the IPv6 address including its surrounding brackets.

                Raises:
                    ValueError: If a bracket is missing, the bracketed address is not a
                        valid IPv6 address, the port separator or the port itself is
                        missing, or the port is not an integer in the range 0-65535.
                """
                addr = dist_init_addr

                # Without this check, input such as "x::1]:80" would be validated as "::1"
                # (the first character is skipped below) and returned with a bogus host.
                if not addr.startswith("["):
                    raise ValueError("invalid IPv6 address format: missing '['")

                end = addr.find("]")
                if end == -1:
                    raise ValueError("invalid IPv6 address format: missing ']'")

                host = addr[: end + 1]

                # This validates only the address without brackets; the format checks
                # below are still needed. Failing here first makes it clear the address
                # itself is wrong rather than the surrounding formatting.
                if not is_valid_ipv6_address(host[1:end]):
                    raise ValueError(f"invalid IPv6 address: {host}")

                remainder = addr[end + 1 :]
                if remainder and not remainder.startswith(":"):
                    raise ValueError("received IPv6 address format: expected ':' after ']'")

                port_str = remainder[1:]
                if not port_str:
                    raise ValueError(
                        "a port must be specified in IPv6 address (format: [ipv6]:port)"
                    )

                try:
                    port = int(port_str)
                except ValueError as err:
                    raise ValueError(f"invalid port in IPv6 address: '{port_str}'") from err

                # int() also accepts negative and arbitrarily large values; neither is a
                # usable TCP port.
                if not 0 <= port <= 65535:
                    raise ValueError(f"invalid port in IPv6 address: '{port_str}'")

                return port, host
         | 
| 1664 | 
            +
             | 
| 1665 | 
            +
             | 
| 1633 1666 | 
             
            def rank0_print(msg: str):
         | 
| 1634 1667 | 
             
                from sglang.srt.distributed import get_tensor_model_parallel_rank
         | 
| 1635 1668 |  | 
    
        sglang/test/runners.py
    CHANGED
    
    | @@ -19,10 +19,16 @@ from typing import List, Optional, Tuple, Union | |
| 19 19 |  | 
| 20 20 | 
             
            import torch
         | 
| 21 21 | 
             
            import torch.nn.functional as F
         | 
| 22 | 
            -
            from transformers import  | 
| 22 | 
            +
            from transformers import (
         | 
| 23 | 
            +
                AutoModel,
         | 
| 24 | 
            +
                AutoModelForCausalLM,
         | 
| 25 | 
            +
                AutoModelForVision2Seq,
         | 
| 26 | 
            +
                AutoProcessor,
         | 
| 27 | 
            +
            )
         | 
| 23 28 |  | 
| 24 29 | 
             
            from sglang.srt.hf_transformers_utils import get_tokenizer
         | 
| 25 30 | 
             
            from sglang.srt.server import Engine
         | 
| 31 | 
            +
            from sglang.srt.utils import load_image
         | 
| 26 32 | 
             
            from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
         | 
| 27 33 |  | 
| 28 34 | 
             
            DEFAULT_PROMPTS = [
         | 
| @@ -140,7 +146,6 @@ class HFRunner: | |
| 140 146 | 
             
                def _get_gme_qwen2_vl_embeddings(
         | 
| 141 147 | 
             
                    self, prompts, image_data: Optional[List[str]] = None
         | 
| 142 148 | 
             
                ):
         | 
| 143 | 
            -
                    from sglang.srt.utils import load_image
         | 
| 144 149 |  | 
| 145 150 | 
             
                    images = None
         | 
| 146 151 | 
             
                    if image_data is not None:
         | 
| @@ -226,6 +231,9 @@ class HFRunner: | |
| 226 231 | 
             
                                low_cpu_mem_usage=True,
         | 
| 227 232 | 
             
                            ).cuda()
         | 
| 228 233 | 
             
                            self.processor = AutoProcessor.from_pretrained(model_path)
         | 
| 234 | 
            +
                        elif "clip" in model_path.lower():
         | 
| 235 | 
            +
                            self.model = AutoModel.from_pretrained(model_path).cuda()
         | 
| 236 | 
            +
                            self.processor = AutoProcessor.from_pretrained(model_path)
         | 
| 229 237 | 
             
                        else:
         | 
| 230 238 | 
             
                            self.model = _get_sentence_transformer_embedding_model(
         | 
| 231 239 | 
             
                                model_path, torch_dtype
         | 
| @@ -272,6 +280,23 @@ class HFRunner: | |
| 272 280 | 
             
                                assert not self.output_str_only
         | 
| 273 281 | 
             
                                if "gme-qwen2-vl" in model_path.lower():
         | 
| 274 282 | 
             
                                    logits = self._get_gme_qwen2_vl_embeddings(prompts, image_data)
         | 
| 283 | 
            +
                                elif "clip" in model_path.lower():
         | 
| 284 | 
            +
                                    if image_data is not None:
         | 
| 285 | 
            +
                                        image = load_image(image_data)
         | 
| 286 | 
            +
                                        inputs = self.processor(
         | 
| 287 | 
            +
                                            images=image[0], return_tensors="pt"
         | 
| 288 | 
            +
                                        )
         | 
| 289 | 
            +
                                        logits = self.model.get_image_features(
         | 
| 290 | 
            +
                                            pixel_values=inputs.data["pixel_values"].cuda(),
         | 
| 291 | 
            +
                                        ).tolist()
         | 
| 292 | 
            +
                                    else:
         | 
| 293 | 
            +
                                        inputs = self.tokenizer(
         | 
| 294 | 
            +
                                            prompts, padding=True, return_tensors="pt"
         | 
| 295 | 
            +
                                        )
         | 
| 296 | 
            +
                                        logits = self.model.get_text_features(
         | 
| 297 | 
            +
                                            input_ids=inputs.data["input_ids"].cuda(),
         | 
| 298 | 
            +
                                            attention_mask=inputs.data["attention_mask"].cuda(),
         | 
| 299 | 
            +
                                        ).tolist()
         | 
| 275 300 | 
             
                                else:
         | 
| 276 301 | 
             
                                    logits = self.model.encode(prompts).tolist()
         | 
| 277 302 | 
             
                                out_queue.put(ModelOutput(embed_logits=logits))
         | 
    
        sglang/test/test_utils.py
    CHANGED
    
    | @@ -29,7 +29,7 @@ from sglang.srt.utils import get_bool_env_var, kill_process_tree | |
| 29 29 | 
             
            from sglang.test.run_eval import run_eval
         | 
| 30 30 | 
             
            from sglang.utils import get_exception_traceback
         | 
| 31 31 |  | 
| 32 | 
            -
            DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
         | 
| 32 | 
            +
            DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
         | 
| 33 33 | 
             
            DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
         | 
| 34 34 | 
             
            DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
         | 
| 35 35 | 
             
                "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
         | 
    
        sglang/version.py
    CHANGED
    
    | @@ -1 +1 @@ | |
| 1 | 
            -
            __version__ = "0.4.4. | 
| 1 | 
            +
            __version__ = "0.4.4.post3"
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            Metadata-Version: 2.4
         | 
| 2 2 | 
             
            Name: sglang
         | 
| 3 | 
            -
            Version: 0.4.4. | 
| 3 | 
            +
            Version: 0.4.4.post3
         | 
| 4 4 | 
             
            Summary: SGLang is yet another fast serving framework for large language models and vision language models.
         | 
| 5 5 | 
             
            License:                                  Apache License
         | 
| 6 6 | 
             
                                               Version 2.0, January 2004
         | 
| @@ -218,6 +218,7 @@ Requires-Dist: numpy | |
| 218 218 | 
             
            Requires-Dist: IPython
         | 
| 219 219 | 
             
            Requires-Dist: setproctitle
         | 
| 220 220 | 
             
            Provides-Extra: runtime-common
         | 
| 221 | 
            +
            Requires-Dist: compressed-tensors; extra == "runtime-common"
         | 
| 221 222 | 
             
            Requires-Dist: datasets; extra == "runtime-common"
         | 
| 222 223 | 
             
            Requires-Dist: decord; extra == "runtime-common"
         | 
| 223 224 | 
             
            Requires-Dist: fastapi; extra == "runtime-common"
         | 
| @@ -240,14 +241,17 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common" | |
| 240 241 | 
             
            Requires-Dist: transformers==4.50.0; extra == "runtime-common"
         | 
| 241 242 | 
             
            Requires-Dist: uvicorn; extra == "runtime-common"
         | 
| 242 243 | 
             
            Requires-Dist: uvloop; extra == "runtime-common"
         | 
| 243 | 
            -
            Requires-Dist:  | 
| 244 | 
            +
            Requires-Dist: compressed-tensors; extra == "runtime-common"
         | 
| 245 | 
            +
            Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
         | 
| 244 246 | 
             
            Provides-Extra: srt
         | 
| 245 247 | 
             
            Requires-Dist: sglang[runtime_common]; extra == "srt"
         | 
| 246 | 
            -
            Requires-Dist: sgl-kernel==0.0.5. | 
| 248 | 
            +
            Requires-Dist: sgl-kernel==0.0.5.post4; extra == "srt"
         | 
| 247 249 | 
             
            Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
         | 
| 248 250 | 
             
            Requires-Dist: torch==2.5.1; extra == "srt"
         | 
| 249 251 | 
             
            Requires-Dist: cuda-python; extra == "srt"
         | 
| 250 252 | 
             
            Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
         | 
| 253 | 
            +
            Requires-Dist: partial_json_parser; extra == "srt"
         | 
| 254 | 
            +
            Requires-Dist: einops; extra == "srt"
         | 
| 251 255 | 
             
            Provides-Extra: srt-hip
         | 
| 252 256 | 
             
            Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
         | 
| 253 257 | 
             
            Requires-Dist: torch; extra == "srt-hip"
         | 
| @@ -271,7 +275,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic" | |
| 271 275 | 
             
            Provides-Extra: litellm
         | 
| 272 276 | 
             
            Requires-Dist: litellm>=1.0.0; extra == "litellm"
         | 
| 273 277 | 
             
            Provides-Extra: torch-memory-saver
         | 
| 274 | 
            -
            Requires-Dist: torch_memory_saver>=0.0. | 
| 278 | 
            +
            Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
         | 
| 275 279 | 
             
            Provides-Extra: test
         | 
| 276 280 | 
             
            Requires-Dist: jsonlines; extra == "test"
         | 
| 277 281 | 
             
            Requires-Dist: matplotlib; extra == "test"
         |