sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/distributed/device_communicators/pynccl.py
CHANGED
```diff
@@ -75,7 +75,8 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
-        logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
         if self.rank == 0:
             # get the unique id from NCCL
```
sglang/srt/distributed/device_communicators/shm_broadcast.py
CHANGED
```diff
@@ -225,7 +225,8 @@ class MessageQueue:
             remote_subscribe_port = get_open_port()
             if is_valid_ipv6_address(connect_ip):
                 self.remote_socket.setsockopt(IPV6, 1)
-            socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
+                connect_ip = f"[{connect_ip}]"
+            socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
             self.remote_socket.bind(socket_addr)
 
         else:
```
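For context: ZMQ `tcp://` endpoints require IPv6 literals to be bracketed, which is what the added line does. A minimal sketch of the same formatting rule, using the standard `ipaddress` module in place of sglang's `is_valid_ipv6_address` helper:

```python
import ipaddress

def format_tcp_addr(connect_ip: str, port: int) -> str:
    """Bracket IPv6 literals as required by ZMQ tcp:// endpoints."""
    try:
        is_ipv6 = ipaddress.ip_address(connect_ip).version == 6
    except ValueError:
        is_ipv6 = False  # hostnames pass through unchanged
    if is_ipv6:
        connect_ip = f"[{connect_ip}]"
    return f"tcp://{connect_ip}:{port}"

print(format_tcp_addr("::1", 5555))        # tcp://[::1]:5555
print(format_tcp_addr("127.0.0.1", 5555))  # tcp://127.0.0.1:5555
```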
sglang/srt/distributed/parallel_state.py
CHANGED
```diff
@@ -42,6 +42,7 @@ from torch.distributed import Backend, ProcessGroup
 from sglang.srt.utils import (
     direct_register_custom_op,
     is_cuda_alike,
+    is_npu,
     supports_custom_op,
 )
 
```
```diff
@@ -206,6 +207,7 @@ class GroupCoordinator:
         use_custom_allreduce: bool,
         use_hpu_communicator: bool,
         use_xpu_communicator: bool,
+        use_npu_communicator: bool,
         use_message_queue_broadcaster: bool = False,
         group_name: Optional[str] = None,
     ):
```
```diff
@@ -244,6 +246,7 @@ class GroupCoordinator:
         self.use_custom_allreduce = use_custom_allreduce
         self.use_hpu_communicator = use_hpu_communicator
         self.use_xpu_communicator = use_xpu_communicator
+        self.use_npu_communicator = use_npu_communicator
         self.use_message_queue_broadcaster = use_message_queue_broadcaster
 
         # lazy import to avoid documentation build error
```
```diff
@@ -291,6 +294,14 @@ class GroupCoordinator:
         if use_xpu_communicator and self.world_size > 1:
             self.xpu_communicator = XpuCommunicator(group=self.device_group)
 
+        from sglang.srt.distributed.device_communicators.npu_communicator import (
+            NpuCommunicator,
+        )
+
+        self.npu_communicator: Optional[NpuCommunicator] = None
+        if use_npu_communicator and self.world_size > 1:
+            self.npu_communicator = NpuCommunicator(group=self.device_group)
+
         from sglang.srt.distributed.device_communicators.shm_broadcast import (
             MessageQueue,
         )
```
```diff
@@ -418,6 +429,9 @@ class GroupCoordinator:
         if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
             return self.xpu_communicator.all_reduce(input_)
 
+        if self.npu_communicator is not None and not self.npu_communicator.disabled:
+            return self.npu_communicator.all_reduce(input_)
+
         if (
             self.ca_comm is not None
             and not self.ca_comm.disabled
```
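Taken together, these hunks slot the NPU communicator into the existing device-communicator dispatch chain: each collective first tries the device-specific backends, and a missing or disabled communicator falls through to the next one. A condensed sketch of the pattern (not the full method; the custom-allreduce/pynccl fallback is elided):

```python
# Condensed sketch of the GroupCoordinator.all_reduce dispatch order after
# this change; the real method continues with custom all-reduce, pynccl,
# and torch.distributed fallbacks.
def all_reduce(self, input_):
    if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
        return self.xpu_communicator.all_reduce(input_)
    if self.npu_communicator is not None and not self.npu_communicator.disabled:
        return self.npu_communicator.all_reduce(input_)
    ...  # custom all-reduce / pynccl / torch.distributed fall-through
```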
```diff
@@ -497,6 +511,11 @@ class GroupCoordinator:
         if hpu_comm is not None and not hpu_comm.disabled:
             return hpu_comm.all_gather(input_, dim)
 
+        # For NPUs, use NPU communicator.
+        npu_comm = self.npu_communicator
+        if npu_comm is not None and not npu_comm.disabled:
+            return npu_comm.all_gather(input_, dim)
+
         if dim < 0:
             # Convert negative dim to positive.
             dim += input_.dim()
```
```diff
@@ -941,6 +960,7 @@ def init_world_group(
         use_custom_allreduce=False,
         use_hpu_communicator=False,
         use_xpu_communicator=False,
+        use_npu_communicator=False,
         group_name="world",
     )
 
```
```diff
@@ -959,10 +979,11 @@ def init_model_parallel_group(
         group_ranks=group_ranks,
         local_rank=local_rank,
         torch_distributed_backend=backend,
-        use_pynccl=True,
+        use_pynccl=not is_npu(),
         use_custom_allreduce=use_custom_allreduce,
         use_hpu_communicator=True,
         use_xpu_communicator=True,
+        use_npu_communicator=True,
         use_message_queue_broadcaster=use_message_queue_broadcaster,
         group_name=group_name,
     )
```
sglang/srt/entrypoints/engine.py
CHANGED
```diff
@@ -163,6 +163,9 @@ class Engine(EngineBase):
         custom_logit_processor: Optional[Union[List[str], str]] = None,
         return_hidden_states: bool = False,
         stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
     ) -> Union[Dict, Iterator[Dict]]:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
```
```diff
@@ -181,6 +184,9 @@ class Engine(EngineBase):
             custom_logit_processor=custom_logit_processor,
             return_hidden_states=return_hidden_states,
             stream=stream,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
         )
         loop = asyncio.get_event_loop()
         generator = self.tokenizer_manager.generate_request(obj, None)
```
```diff
@@ -227,6 +233,9 @@ class Engine(EngineBase):
         lora_path: Optional[List[Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
         stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
     ) -> Union[Dict, AsyncIterator[Dict]]:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
```
```diff
@@ -244,6 +253,9 @@ class Engine(EngineBase):
             lora_path=lora_path,
             stream=stream,
             custom_logit_processor=custom_logit_processor,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
         )
         generator = self.tokenizer_manager.generate_request(obj, None)
 
```
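The new `bootstrap_host`/`bootstrap_port`/`bootstrap_room` parameters are forwarded verbatim into `GenerateReqInput`, so offline `Engine` users can drive prefill/decode (PD) disaggregation per request. A hedged usage sketch; the model path and bootstrap values below are placeholders:

```python
import sglang as sgl

# Placeholder model and bootstrap values; bootstrap_* are only meaningful
# when running prefill/decode disaggregation with a bootstrap server.
engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
out = engine.generate(
    prompt="The capital of France is",
    sampling_params={"max_new_tokens": 8},
    bootstrap_host="10.0.0.1",
    bootstrap_port=8998,
    bootstrap_room=42,  # room id pairing a prefill request with its decode side
)
print(out["text"])
```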
```diff
@@ -273,6 +285,21 @@ class Engine(EngineBase):
         ret = loop.run_until_complete(generator.__anext__())
         return ret
 
+    async def async_encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[Union[List[str], str]] = None,
+    ) -> Dict:
+        """
+        Asynchronous version of encode method.
+
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(text=prompt, image_data=image_data)
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        return await generator.__anext__()
+
     def shutdown(self):
         """Shutdown the engine"""
         kill_process_tree(os.getpid(), include_parent=False)
```
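A minimal sketch of the new `async_encode`, assuming an embedding model is loaded (the model id below is a placeholder):

```python
import asyncio
import sglang as sgl

async def main():
    # Placeholder embedding model; is_embedding=True serves encode requests.
    engine = sgl.Engine(
        model_path="intfloat/e5-mistral-7b-instruct", is_embedding=True
    )
    result = await engine.async_encode("The quick brown fox")
    print(len(result["embedding"]))
    engine.shutdown()

asyncio.run(main())
```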
```diff
@@ -303,7 +330,7 @@ class Engine(EngineBase):
         return {
             **dataclasses.asdict(self.tokenizer_manager.server_args),
             **self.scheduler_info,
-            **internal_states,
+            "internal_states": internal_states,
             "version": __version__,
         }
 
```
```diff
@@ -348,8 +375,8 @@ class Engine(EngineBase):
         load_format: Optional[str] = None,
         flush_cache: bool = True,
     ):
-        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be
-        to avoid duplicated
+        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be false
+        to avoid duplicated cache cleaning operation."""
         obj = UpdateWeightsFromTensorReqInput(
             serialized_named_tensors=[
                 MultiprocessingSerializer.serialize(named_tensors)
```
```diff
@@ -459,7 +486,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.
+            "0.1.2.post1",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
```
sglang/srt/entrypoints/http_server.py
CHANGED
```diff
@@ -42,10 +42,14 @@ from fastapi import FastAPI, File, Form, Request, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
-from sglang.srt.disaggregation.utils import FakeBootstrapHost
+from sglang.srt.disaggregation.utils import (
+    FakeBootstrapHost,
+    register_disaggregation_server,
+)
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
+    AbortReq,
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
```
```diff
@@ -59,6 +63,7 @@ from sglang.srt.managers.io_struct import (
     ResumeMemoryOccupationReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
+    SlowDownReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
```
```diff
@@ -217,7 +222,7 @@ async def get_server_info():
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
-        **internal_states,
+        "internal_states": internal_states,
         "version": __version__,
     }
 
```
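Because the internal states are now returned under a dedicated `internal_states` key instead of being spread into the top-level dict, clients reading `/get_server_info` should index accordingly; a short example against a local server (default port assumed):

```python
import requests

info = requests.get("http://localhost:30000/get_server_info").json()
print(info["version"])
# Previously these fields were merged into the top level; now they are
# grouped under a single key.
print(info["internal_states"])
```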
```diff
@@ -333,7 +338,11 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
         obj = ProfileReqInput()
 
     await _global_state.tokenizer_manager.start_profile(
-        obj.output_dir,
+        output_dir=obj.output_dir,
+        num_steps=obj.num_steps,
+        activities=obj.activities,
+        with_stack=obj.with_stack,
+        record_shapes=obj.record_shapes,
     )
     return Response(
         content="Start profiling.\n",
```
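With the endpoint now forwarding all `ProfileReqInput` fields, a profiling request can control the torch profiler options directly. An illustrative request (values are arbitrary):

```python
import requests

requests.post(
    "http://localhost:30000/start_profile",
    json={
        "output_dir": "/tmp/sglang_trace",  # where traces are written
        "num_steps": 5,                     # stop automatically after 5 steps
        "activities": ["CPU", "GPU"],
        "with_stack": True,                 # record Python call stacks
        "record_shapes": True,              # record tensor shapes
    },
)
```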
```diff
@@ -491,6 +500,19 @@ async def resume_memory_occupation(
         return _create_error_response(e)
 
 
+@app.api_route("/slow_down", methods=["GET", "POST"])
+async def slow_down(obj: SlowDownReqInput, request: Request):
+    """Slow down the system deliberately. Only for testing. Example scenario:
+    when we want to test performance of D in large-scale PD disaggregation and have no enough nodes for P,
+    we can use this to slow down D to let it have enough running sequences, and then disable slowdown
+    to let it run in full batch size.
+    """
+    try:
+        await _global_state.tokenizer_manager.slow_down(obj, request)
+    except Exception as e:
+        return _create_error_response(e)
+
+
 @app.api_route("/open_session", methods=["GET", "POST"])
 async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
```
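A hedged example of exercising the new testing endpoint. The fields of `SlowDownReqInput` are not visible in this diff, so the `forward_sleep_time` payload below is an assumption about its schema:

```python
import requests

BASE = "http://localhost:30000"

# Assumed schema: a per-forward-pass sleep in seconds enables the slowdown...
requests.post(f"{BASE}/slow_down", json={"forward_sleep_time": 0.5})

# ...and a null value presumably disables it again.
requests.post(f"{BASE}/slow_down", json={"forward_sleep_time": None})
```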
```diff
@@ -522,6 +544,16 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
     return Response(status_code=200)
 
 
+@app.post("/abort_request")
+async def abort_request(obj: AbortReq, request: Request):
+    """Abort a request."""
+    try:
+        _global_state.tokenizer_manager.abort_request(rid=obj.rid)
+        return Response(status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
 @app.post("/parse_function_call")
 async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
     """
```
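Usage is straightforward: post the request id (`rid`) that was assigned when the generation request was submitted:

```python
import requests

# Abort an in-flight generation by its request id.
requests.post("http://localhost:30000/abort_request", json={"rid": "example-rid"})
```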
```diff
@@ -675,6 +707,8 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Request):
         **(vertex_req.parameters or {}),
     )
     ret = await generate_request(req, raw_request)
+    if isinstance(ret, Response):
+        return ret
     return ORJSONResponse({"predictions": ret})
 
 
```
```diff
@@ -869,5 +903,13 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())
 
+    if server_args.pdlb_url is not None:
+        register_disaggregation_server(
+            server_args.disaggregation_mode,
+            server_args.port,
+            server_args.disaggregation_bootstrap_port,
+            server_args.pdlb_url,
+        )
+
     if launch_callback is not None:
         launch_callback()
```
sglang/srt/entrypoints/verl_engine.py
CHANGED
```diff
@@ -37,6 +37,7 @@ class VerlEngine:
         monkey_patch_torch_reductions()
         self._device_mesh_cpu = device_mesh_cpu
         self._tp_rank = device_mesh_cpu.get_local_rank()
+        self._rank = device_mesh_cpu.get_rank()
         self._tp_size = device_mesh_cpu.size()
         tp_size_per_node = self._tp_size // nnodes
         node_rank = self._tp_rank // tp_size_per_node
```
```diff
@@ -114,7 +115,7 @@ class VerlEngine:
         # Most naive implementation, can extract tensor and send via gloo if too slow
         [output] = broadcast_pyobj(
             data=[output],
-            rank=self._tp_rank,
+            rank=self._rank,
             dist_group=self._device_mesh_cpu.get_group(),
             src=self._device_mesh_cpu.mesh[0].item(),
             force_cpu_device=False,
```
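The fix distinguishes the mesh-local rank from the global rank: `get_local_rank()` is the position within the mesh dimension (the TP rank), while `broadcast_pyobj` over the whole CPU group needs `get_rank()`, the global rank. A runnable sketch of the distinction (launch with `torchrun --nproc_per_node=2`; on a single-node 1-D mesh the two coincide, but they diverge for multi-node sub-meshes, which is where the original code broke):

```python
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

dist.init_process_group("gloo")
mesh = init_device_mesh("cpu", (dist.get_world_size(),), mesh_dim_names=("tp",))

# get_local_rank(): rank within the mesh dimension (what self._tp_rank stores)
# get_rank():       global rank in the default group (the new self._rank)
print(f"local={mesh.get_local_rank()} global={mesh.get_rank()}")
```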
```diff
@@ -157,7 +158,7 @@ class VerlEngine:
         )
 
         if self._tp_rank == 0:
-            self._engine.
+            self._engine.flush_cache()
 
     def release_memory_occupation(self):
         if self._tp_rank == 0:
```
sglang/srt/function_call_parser.py
CHANGED
```diff
@@ -86,8 +86,8 @@ class StructureInfo:
 
 _GetInfoFunc = Callable[[str], StructureInfo]
 """
-
-
+Helper alias of function
+Usually it is a function that takes a name string and returns a StructureInfo object,
 which can be used to construct a structural_tag object
 """
 
```
sglang/srt/hf_transformers_utils.py
CHANGED
```diff
@@ -19,6 +19,7 @@ import warnings
 from pathlib import Path
 from typing import Dict, Optional, Type, Union
 
+import transformers
 from huggingface_hub import snapshot_download
 from transformers import (
     AutoConfig,
```
```diff
@@ -26,6 +27,7 @@ from transformers import (
     AutoTokenizer,
     PretrainedConfig,
     PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
     PreTrainedTokenizerFast,
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
```
```diff
@@ -38,6 +40,7 @@ from sglang.srt.configs import (
     KimiVLConfig,
     MultiModalityConfig,
 )
+from sglang.srt.configs.internvl import InternVLChatConfig
 from sglang.srt.connector import create_remote_connector
 from sglang.srt.utils import is_remote_url
 
```
```diff
@@ -48,6 +51,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     DeepseekVL2Config.model_type: DeepseekVL2Config,
     MultiModalityConfig.model_type: MultiModalityConfig,
     KimiVLConfig.model_type: KimiVLConfig,
+    InternVLChatConfig.model_type: InternVLChatConfig,
 }
 
 for name, cls in _CONFIG_REGISTRY.items():
```
```diff
@@ -90,6 +94,12 @@ def get_config(
         config = config_class.from_pretrained(model, revision=revision)
         # NOTE(HandH1998): Qwen2VL requires `_name_or_path` attribute in `config`.
         setattr(config, "_name_or_path", model)
+
+    if isinstance(model, str) and config.model_type == "internvl_chat":
+        for key, val in config.llm_config.__dict__.items():
+            if not hasattr(config, key):
+                setattr(config, key, val)
+
     if model_override_args:
         config.update(model_override_args)
 
```
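The new block flattens InternVL's nested `llm_config` onto the top-level config so downstream code that reads flat attributes (e.g. `num_hidden_layers`) keeps working. A toy illustration of the same flattening, with `SimpleNamespace` standing in for the real HF config objects:

```python
from types import SimpleNamespace

llm_config = SimpleNamespace(num_hidden_layers=32, hidden_size=4096)
config = SimpleNamespace(model_type="internvl_chat", llm_config=llm_config)

# Copy nested keys up, never overwriting existing top-level attributes.
for key, val in config.llm_config.__dict__.items():
    if not hasattr(config, key):
        setattr(config, key, val)

print(config.num_hidden_layers)  # 32, now readable without going via llm_config
```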
```diff
@@ -211,6 +221,13 @@ def get_tokenizer(
     return tokenizer
 
 
+# Some models doesn't have an available processor, e.g.: InternVL
+def get_tokenizer_from_processor(processor):
+    if isinstance(processor, PreTrainedTokenizerBase):
+        return processor
+    return processor.tokenizer
+
+
 def get_processor(
     tokenizer_name: str,
     *args,
```
```diff
@@ -246,7 +263,9 @@ def get_processor(
         **kwargs,
     )
 
-    attach_additional_stop_token_ids(processor.tokenizer)
+    tokenizer = get_tokenizer_from_processor(processor)
+
+    attach_additional_stop_token_ids(tokenizer)
     return processor
 
 
```
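A small sketch of why the `get_tokenizer_from_processor` helper exists: `AutoProcessor` may hand back a bare tokenizer for models without a dedicated processor (the InternVL case noted in the comment above), and a wrapping processor otherwise; the helper normalizes both. The model id below is a placeholder:

```python
from transformers import AutoProcessor, PreTrainedTokenizerBase

def get_tokenizer_from_processor(processor):
    # Same logic as the new helper above.
    if isinstance(processor, PreTrainedTokenizerBase):
        return processor
    return processor.tokenizer

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
tokenizer = get_tokenizer_from_processor(processor)
print(type(tokenizer).__name__)  # a tokenizer either way
```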