sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +18 -3
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  5. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
  6. sglang/srt/checkpoint_engine/__init__.py +9 -0
  7. sglang/srt/checkpoint_engine/update.py +317 -0
  8. sglang/srt/configs/__init__.py +2 -0
  9. sglang/srt/configs/deepseek_ocr.py +542 -10
  10. sglang/srt/configs/deepseekvl2.py +95 -194
  11. sglang/srt/configs/kimi_linear.py +160 -0
  12. sglang/srt/configs/mamba_utils.py +66 -0
  13. sglang/srt/configs/model_config.py +25 -2
  14. sglang/srt/constants.py +7 -0
  15. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  16. sglang/srt/disaggregation/decode.py +34 -6
  17. sglang/srt/disaggregation/nixl/conn.py +2 -2
  18. sglang/srt/disaggregation/prefill.py +25 -3
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  20. sglang/srt/distributed/parallel_state.py +9 -5
  21. sglang/srt/entrypoints/engine.py +13 -5
  22. sglang/srt/entrypoints/http_server.py +22 -3
  23. sglang/srt/entrypoints/openai/protocol.py +7 -1
  24. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  27. sglang/srt/environ.py +7 -0
  28. sglang/srt/eplb/expert_distribution.py +34 -1
  29. sglang/srt/eplb/expert_location.py +106 -36
  30. sglang/srt/grpc/compile_proto.py +3 -0
  31. sglang/srt/layers/attention/ascend_backend.py +233 -5
  32. sglang/srt/layers/attention/attention_registry.py +3 -0
  33. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  34. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  35. sglang/srt/layers/attention/fla/kda.py +1359 -0
  36. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  37. sglang/srt/layers/attention/flashattention_backend.py +7 -6
  38. sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
  39. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  40. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  41. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  42. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  43. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  44. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  45. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  46. sglang/srt/layers/attention/nsa_backend.py +157 -23
  47. sglang/srt/layers/attention/triton_backend.py +4 -1
  48. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  49. sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
  50. sglang/srt/layers/communicator.py +23 -1
  51. sglang/srt/layers/layernorm.py +16 -2
  52. sglang/srt/layers/logits_processor.py +4 -20
  53. sglang/srt/layers/moe/ep_moe/layer.py +0 -18
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  57. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  59. sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
  60. sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
  61. sglang/srt/layers/moe/topk.py +31 -6
  62. sglang/srt/layers/pooler.py +21 -2
  63. sglang/srt/layers/quantization/__init__.py +9 -78
  64. sglang/srt/layers/quantization/auto_round.py +394 -0
  65. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  66. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  67. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  68. sglang/srt/layers/rotary_embedding.py +117 -45
  69. sglang/srt/lora/lora_registry.py +9 -0
  70. sglang/srt/managers/async_mm_data_processor.py +122 -0
  71. sglang/srt/managers/data_parallel_controller.py +30 -3
  72. sglang/srt/managers/detokenizer_manager.py +3 -0
  73. sglang/srt/managers/io_struct.py +26 -4
  74. sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
  75. sglang/srt/managers/schedule_batch.py +74 -15
  76. sglang/srt/managers/scheduler.py +164 -129
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  78. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  79. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  80. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  81. sglang/srt/managers/session_controller.py +6 -5
  82. sglang/srt/managers/tokenizer_manager.py +154 -59
  83. sglang/srt/managers/tp_worker.py +24 -1
  84. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  85. sglang/srt/mem_cache/common.py +1 -0
  86. sglang/srt/mem_cache/memory_pool.py +171 -57
  87. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  88. sglang/srt/mem_cache/radix_cache.py +4 -0
  89. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  90. sglang/srt/metrics/collector.py +46 -3
  91. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  92. sglang/srt/model_executor/forward_batch_info.py +11 -11
  93. sglang/srt/model_executor/model_runner.py +76 -21
  94. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  95. sglang/srt/model_loader/weight_utils.py +1 -1
  96. sglang/srt/models/bailing_moe.py +9 -2
  97. sglang/srt/models/deepseek_nextn.py +11 -2
  98. sglang/srt/models/deepseek_v2.py +149 -34
  99. sglang/srt/models/glm4.py +391 -77
  100. sglang/srt/models/glm4v.py +196 -55
  101. sglang/srt/models/glm4v_moe.py +0 -1
  102. sglang/srt/models/gpt_oss.py +1 -10
  103. sglang/srt/models/kimi_linear.py +678 -0
  104. sglang/srt/models/llama4.py +1 -1
  105. sglang/srt/models/llama_eagle3.py +11 -1
  106. sglang/srt/models/longcat_flash.py +2 -2
  107. sglang/srt/models/minimax_m2.py +1 -1
  108. sglang/srt/models/qwen2.py +1 -1
  109. sglang/srt/models/qwen2_moe.py +30 -15
  110. sglang/srt/models/qwen3.py +1 -1
  111. sglang/srt/models/qwen3_moe.py +16 -8
  112. sglang/srt/models/qwen3_next.py +7 -0
  113. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  114. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  115. sglang/srt/multiplex/pdmux_context.py +164 -0
  116. sglang/srt/parser/conversation.py +7 -1
  117. sglang/srt/sampling/custom_logit_processor.py +67 -1
  118. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  119. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  120. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  121. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  122. sglang/srt/server_args.py +103 -22
  123. sglang/srt/single_batch_overlap.py +4 -1
  124. sglang/srt/speculative/draft_utils.py +16 -0
  125. sglang/srt/speculative/eagle_info.py +42 -36
  126. sglang/srt/speculative/eagle_info_v2.py +68 -25
  127. sglang/srt/speculative/eagle_utils.py +261 -16
  128. sglang/srt/speculative/eagle_worker.py +11 -3
  129. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  130. sglang/srt/speculative/spec_info.py +305 -31
  131. sglang/srt/speculative/spec_utils.py +44 -8
  132. sglang/srt/tracing/trace.py +121 -12
  133. sglang/srt/utils/common.py +55 -32
  134. sglang/srt/utils/hf_transformers_utils.py +38 -16
  135. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  136. sglang/test/kits/radix_cache_server_kit.py +50 -0
  137. sglang/test/runners.py +31 -7
  138. sglang/test/simple_eval_common.py +5 -3
  139. sglang/test/simple_eval_humaneval.py +1 -0
  140. sglang/test/simple_eval_math.py +1 -0
  141. sglang/test/simple_eval_mmlu.py +1 -0
  142. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  143. sglang/test/test_utils.py +7 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
  146. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
  147. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  148. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  149. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/configs/deepseekvl2.py

@@ -11,8 +11,6 @@ from transformers import (
  ProcessorMixin,
  )

- from sglang.srt.configs.deepseek_ocr import BASE_SIZE, IMAGE_SIZE, MAX_CROPS, MIN_CROPS
-

  def select_best_resolution(image_size, candidate_resolutions):
  # used for cropping
@@ -63,7 +61,6 @@ class DictOutput(object):
  class VLChatProcessorOutput(DictOutput):
  input_ids: torch.LongTensor
  target_ids: torch.LongTensor
- images_crop: torch.LongTensor
  pixel_values: (
  torch.Tensor
  ) # rename from "images" to "pixel_values" for compatibility
@@ -107,68 +104,6 @@ class ImageTransform(object):
  return x


- def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
- best_ratio_diff = float("inf")
- best_ratio = (1, 1)
- area = width * height
- for ratio in target_ratios:
- target_aspect_ratio = ratio[0] / ratio[1]
- ratio_diff = abs(aspect_ratio - target_aspect_ratio)
- if ratio_diff < best_ratio_diff:
- best_ratio_diff = ratio_diff
- best_ratio = ratio
- elif ratio_diff == best_ratio_diff:
- if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
- best_ratio = ratio
- return best_ratio
-
-
- def dynamic_preprocess(
- image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False
- ):
- orig_width, orig_height = image.size
- aspect_ratio = orig_width / orig_height
-
- # calculate the existing image aspect ratio
- target_ratios = set(
- (i, j)
- for n in range(min_num, max_num + 1)
- for i in range(1, n + 1)
- for j in range(1, n + 1)
- if i * j <= max_num and i * j >= min_num
- )
- target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
- # find the closest aspect ratio to the target
- target_aspect_ratio = find_closest_aspect_ratio(
- aspect_ratio, target_ratios, orig_width, orig_height, image_size
- )
-
- # calculate the target width and height
- target_width = image_size * target_aspect_ratio[0]
- target_height = image_size * target_aspect_ratio[1]
- blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
- # resize the image
- resized_img = image.resize((target_width, target_height))
- processed_images = []
- for i in range(blocks):
- box = (
- (i % (target_width // image_size)) * image_size,
- (i // (target_width // image_size)) * image_size,
- ((i % (target_width // image_size)) + 1) * image_size,
- ((i // (target_width // image_size)) + 1) * image_size,
- )
- # split the image
- split_img = resized_img.crop(box)
- processed_images.append(split_img)
- assert len(processed_images) == blocks
- if use_thumbnail and len(processed_images) != 1:
- thumbnail_img = image.resize((image_size, image_size))
- processed_images.append(thumbnail_img)
- return processed_images, target_aspect_ratio
-
-
  class DeepseekVLV2Processor(ProcessorMixin):
  tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
  attributes = ["tokenizer"]
@@ -198,7 +133,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
  self.image_std = image_std
  self.normalize = normalize
  self.downsample_ratio = downsample_ratio
- self.base_size = BASE_SIZE
+
  self.image_transform = ImageTransform(
  mean=image_mean, std=image_std, normalize=normalize
  )
@@ -241,7 +176,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
  **kwargs,
  )

- def format_messages_v2(self, messages: str, pil_images, max_req_input_len=-1):
+ def format_messages_v2(self, messages, pil_images, max_req_input_len=-1):
  """play the role of format_messages_v2 and get_images_info in the last version"""
  tokenized_data = []
  masked_tokenized_data = [] # labels
@@ -251,34 +186,35 @@ class DeepseekVLV2Processor(ProcessorMixin):

  image_index = 0
  image_token_cnt = messages.count(self.image_token)
- (
- input_ids,
- images,
- images_crop,
- seq_mask,
- spatial_crop,
- num_image_tokens,
- image_shapes,
- ) = self.tokenize_with_images(
+ tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
  messages,
  pil_images[image_index : image_index + image_token_cnt],
  bos=True,
  eos=True,
  cropping=len(pil_images) <= 2,
+ max_req_input_len=max_req_input_len,
  )

  image_index = image_token_cnt
+ tokenized_data += tokenized_str
+ if self.mask_prompt:
+ masked_tokenized_data += [self.ignore_id] * len(tokenized_str)
+ else:
+ masked_tokenized_data += tokenized_str
  images_list += images
  images_seq_mask += seq_mask
- images_spatial_crop = spatial_crop
+ images_spatial_crop += spatial_crop
+
+ assert len(tokenized_data) == len(
+ images_seq_mask
+ ), f"format_messages_v2: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"

  return (
- input_ids,
+ tokenized_data,
  masked_tokenized_data,
  images_list,
  images_seq_mask,
  images_spatial_crop,
- images_crop,
  )

  @property
@@ -315,7 +251,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
  inference_mode: bool = True,
  system_prompt: str = "",
  max_req_input_len: int = -1,
- cropping: bool = True,
  **kwargs,
  ):
  """
@@ -339,22 +274,47 @@ class DeepseekVLV2Processor(ProcessorMixin):
  - num_image_tokens (List[int]): the number of image tokens
  """

- prompt = conversations or prompt
+ assert (
+ prompt is None or conversations is None
+ ), "prompt and conversations cannot be used at the same time."
+
  (
- input_ids,
+ tokenized_str,
  masked_tokenized_str,
  images_list,
  images_seq_mask,
  images_spatial_crop,
- images_crop,
- ) = self.format_messages_v2(prompt, images, max_req_input_len)
+ ) = self.format_messages_v2(conversations, images, max_req_input_len)

+ assert (
+ len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+ ), (
+ f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+ f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+ )
+
+ input_ids = torch.LongTensor(tokenized_str)
  target_ids = torch.LongTensor(masked_tokenized_str)
+ images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+ # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+ target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+ self.ignore_id
+ )
+ input_ids[input_ids < 0] = self.pad_id
+
+ if inference_mode:
+ assert input_ids[-1] == self.eos_id
+ input_ids = input_ids[:-1]
+ target_ids = target_ids[:-1]
+ images_seq_mask = images_seq_mask[:-1]

  if len(images_list) == 0:
  images = torch.zeros((1, 3, self.image_size, self.image_size))
+ images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
  else:
  images = torch.stack(images_list, dim=0)
+ images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)

  images_spatial_crop = torch.stack(
  [images_spatial_crop], dim=0
@@ -363,7 +323,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
  prepare = VLChatProcessorOutput(
  input_ids=input_ids,
  target_ids=target_ids,
- images_crop=images_crop,
  pixel_values=images,
  images_seq_mask=images_seq_mask,
  images_spatial_crop=images_spatial_crop,
@@ -381,14 +340,10 @@ class DeepseekVLV2Processor(ProcessorMixin):
  inference_mode: bool = True,
  system_prompt: str = "",
  max_req_input_len: int = -1,
- text: list[str] = None,
  **kwargs,
  ):
- assert text is None or isinstance(text, list)
- if text is not None:
- text = text[0]
  prepare = self.process_one(
- prompt=prompt or text,
+ prompt=prompt,
  conversations=conversations,
  images=images,
  apply_sft_format=apply_sft_format,
@@ -413,83 +368,85 @@ class DeepseekVLV2Processor(ProcessorMixin):
  bos: bool = True,
  eos: bool = True,
  cropping: bool = True,
+ max_req_input_len: int = -1,
  ):
  """Tokenize text with <image> tags."""
-
- conversation = conversation
- assert conversation.count(self.image_token) == len(images)
+ images_list, images_seq_mask, images_spatial_crop = [], [], []
  text_splits = conversation.split(self.image_token)
- images_list, images_crop_list, images_seq_mask, images_spatial_crop = (
- [],
- [],
- [],
- [],
- )
- image_shapes = []
- num_image_tokens = []
  tokenized_str = []
  for text_sep, image in zip(text_splits, images):
  """encode text_sep"""
  tokenized_sep = self.encode(text_sep, bos=False, eos=False)
-
  tokenized_str += tokenized_sep
  images_seq_mask += [False] * len(tokenized_sep)

- image_shapes.append(image.size)
-
- if image.size[0] <= 640 and image.size[1] <= 640:
- crop_ratio = [1, 1]
+ """select best resolution for anyres"""
+ if cropping:
+ best_width, best_height = select_best_resolution(
+ image.size, self.candidate_resolutions
+ )
  else:
- if cropping:
- images_crop_raw, crop_ratio = dynamic_preprocess(
- image, image_size=IMAGE_SIZE
- )
- else:
- crop_ratio = [1, 1]
+ best_width, best_height = self.image_size, self.image_size
+ # print(image.size, (best_width, best_height)) # check the select_best_resolutions func

  """process the global view"""
- if self.image_size <= 640 and not cropping:
- image = image.resize((self.image_size, self.image_size))
-
  global_view = ImageOps.pad(
  image,
- (self.base_size, self.base_size),
+ (self.image_size, self.image_size),
  color=tuple(int(x * 255) for x in self.image_transform.mean),
  )
  images_list.append(self.image_transform(global_view))

- num_width_tiles, num_height_tiles = crop_ratio
- images_spatial_crop.append([num_width_tiles, num_height_tiles])
+ """process the local views"""
+ local_view = ImageOps.pad(
+ image,
+ (best_width, best_height),
+ color=tuple(int(x * 255) for x in self.image_transform.mean),
+ )
+ for i in range(0, best_height, self.image_size):
+ for j in range(0, best_width, self.image_size):
+ images_list.append(
+ self.image_transform(
+ local_view.crop(
+ (j, i, j + self.image_size, i + self.image_size)
+ )
+ )
+ )

- if num_width_tiles > 1 or num_height_tiles > 1:
- for i in range(len(images_crop_raw)):
- images_crop_list.append(self.image_transform(images_crop_raw[i]))
+ """record height / width crop num"""
+ num_width_tiles, num_height_tiles = (
+ best_width // self.image_size,
+ best_height // self.image_size,
+ )
+ images_spatial_crop.append([num_width_tiles, num_height_tiles])

  """add image tokens"""
- num_queries = math.ceil(
+ h = w = math.ceil(
  (self.image_size // self.patch_size) / self.downsample_ratio
  )
- num_queries_base = math.ceil(
- (self.base_size // self.patch_size) / self.downsample_ratio
+ # global views tokens h * (w + 1), 1 is for line separator
+ tokenized_image = [self.image_token_id] * h * (w + 1)
+ # add a separator between global and local views
+ tokenized_image += [self.image_token_id]
+ # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+ tokenized_image += (
+ [self.image_token_id]
+ * (num_height_tiles * h)
+ * (num_width_tiles * w + 1)
  )

- tokenized_image = (
- [self.image_token_id] * num_queries_base + [self.image_token_id]
- ) * num_queries_base
- tokenized_image += [self.image_token_id]
- if num_width_tiles > 1 or num_height_tiles > 1:
- tokenized_image += (
- [self.image_token_id] * (num_queries * num_width_tiles)
- + [self.image_token_id]
- ) * (num_queries * num_height_tiles)
  tokenized_str += tokenized_image
-
  images_seq_mask += [True] * len(tokenized_image)
- num_image_tokens.append(len(tokenized_image))
+ # print(width_crop_num, height_crop_num, len(tokenized_image)) # test the correctness of the number of image-related tokens

  """process the last text split"""
  tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
-
+ # deal with video, limit with request len
+ if max_req_input_len > -1:
+ if max_req_input_len < len(tokenized_sep) + len(tokenized_str) - 1:
+ rest = max_req_input_len - len(tokenized_sep) - 1 - 1024
+ tokenized_str = tokenized_str[:rest]
+ images_seq_mask = images_seq_mask[:rest]
  tokenized_str += tokenized_sep
  images_seq_mask += [False] * len(tokenized_sep)

@@ -505,64 +462,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
  images_seq_mask
  ), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"

- masked_tokenized_str = []
- for token_index in tokenized_str:
- if token_index != self.image_token_id:
- masked_tokenized_str.append(token_index)
- else:
- masked_tokenized_str.append(self.ignore_id)
-
- assert (
- len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
- ), (
- f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
- f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
- )
- input_ids = torch.LongTensor(tokenized_str)
- target_ids = torch.LongTensor(masked_tokenized_str)
- images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
-
- # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
- target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
- self.ignore_id
- )
- input_ids[input_ids < 0] = self.pad_id
-
- inference_mode = True
-
- if inference_mode:
- # Remove the ending eos token
- assert input_ids[-1] == self.eos_id
- input_ids = input_ids[:-1]
- target_ids = target_ids[:-1]
- images_seq_mask = images_seq_mask[:-1]
-
- if len(images_list) == 0:
- pixel_values = torch.zeros((1, 3, self.base_size, self.base_size))
- images_spatial_crop = torch.zeros((1, 1), dtype=torch.long)
- images_crop = torch.zeros(
- (1, 3, self.image_size, self.image_size)
- ).unsqueeze(0)
- else:
- pixel_values = torch.stack(images_list, dim=0)
- images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
- if images_crop_list:
- images_crop = torch.stack(images_crop_list, dim=0).unsqueeze(0)
- else:
- images_crop = torch.zeros(
- (1, 3, self.image_size, self.image_size)
- ).unsqueeze(0)
-
- input_ids = input_ids.unsqueeze(0)
- return (
- input_ids,
- pixel_values,
- images_crop,
- images_seq_mask,
- images_spatial_crop,
- num_image_tokens,
- image_shapes,
- )
+ return tokenized_str, images_list, images_seq_mask, images_spatial_crop


  class DeepseekVL2VisionEncoderConfig(PretrainedConfig):
@@ -647,6 +547,7 @@ class DeepseekVL2MlpProjectorConfig(PretrainedConfig):


  class DeepseekV2Config(PretrainedConfig):
+
  model_type = "deepseek_v2"
  keys_to_ignore_at_inference = ["past_key_values"]

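Note on the deepseekvl2.py hunks above: post2 drops the OCR-style dynamic_preprocess cropping path and restores the original anyres tiling in tokenize_with_images, so the number of image placeholder tokens per image again follows the global/local layout spelled out in the restored code. A minimal sketch of that arithmetic in Python; the concrete sizes below are illustrative assumptions, not values from the shipped processor config:

import math

def image_token_count(best_width, best_height, image_size, patch_size, downsample_ratio):
    # Per-tile query grid, mirroring the restored tokenize_with_images().
    h = w = math.ceil((image_size // patch_size) / downsample_ratio)
    num_width_tiles = best_width // image_size
    num_height_tiles = best_height // image_size
    global_tokens = h * (w + 1)  # global view: +1 token per row as a line separator
    separator = 1                # one separator between global and local views
    local_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)
    return global_tokens + separator + local_tokens

# Illustrative numbers only (not the shipped DeepSeek-VL2 settings):
print(image_token_count(best_width=768, best_height=384,
                        image_size=384, patch_size=16, downsample_ratio=2))  # 457
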
sglang/srt/configs/kimi_linear.py (new file)

@@ -0,0 +1,160 @@
+ # Adapted from: https://github.com/vllm-project/vllm/blob/0384aa7150c4c9778efca041ffd1beb3ad2bd694/vllm/transformers_utils/configs/kimi_linear.py
+ from transformers.configuration_utils import PretrainedConfig
+
+ from sglang.srt.configs.mamba_utils import KimiLinearCacheParams, KimiLinearStateShape
+ from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+
+ class KimiLinearConfig(PretrainedConfig):
+ model_type = "kimi_linear"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ model_type="kimi_linear",
+ vocab_size=163840,
+ hidden_size=4096,
+ head_dim=None,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ tie_word_embeddings=False,
+ moe_intermediate_size: int | None = None,
+ moe_renormalize: bool = True,
+ moe_router_activation_func: str = "sigmoid",
+ num_experts: int | None = None,
+ num_experts_per_token: int | None = None,
+ num_shared_experts: int = 0,
+ routed_scaling_factor: float = 1.0,
+ first_k_dense_replace: int = 0,
+ moe_layer_freq: int = 1,
+ use_grouped_topk: bool = True,
+ num_expert_group: int = 1,
+ topk_group: int = 1,
+ q_lora_rank: int | None = None,
+ kv_lora_rank: int | None = None,
+ qk_nope_head_dim: int | None = None,
+ qk_rope_head_dim: int | None = None,
+ v_head_dim: int | None = None,
+ mla_use_nope: bool | None = False,
+ num_nextn_predict_layers: int = 0,
+ linear_attn_config: dict | None = None,
+ **kwargs,
+ ):
+ self.model_type = model_type
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.head_dim = (
+ head_dim if head_dim is not None else hidden_size // num_attention_heads
+ )
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+
+ self.q_lora_rank = q_lora_rank
+ self.kv_lora_rank = kv_lora_rank
+ self.qk_nope_head_dim = qk_nope_head_dim
+ self.qk_rope_head_dim = qk_rope_head_dim
+ self.v_head_dim = v_head_dim
+ self.mla_use_nope = mla_use_nope
+ # moe config
+ self.n_routed_experts = self.num_experts = num_experts
+ self.num_experts_per_token = num_experts_per_token
+ self.moe_renormalize = moe_renormalize
+ self.num_shared_experts = num_shared_experts
+ self.routed_scaling_factor = routed_scaling_factor
+ self.moe_router_activation_func = moe_router_activation_func
+ assert self.moe_router_activation_func in ("softmax", "sigmoid")
+ self.moe_intermediate_size = moe_intermediate_size
+ self.first_k_dense_replace = first_k_dense_replace
+ self.moe_layer_freq = moe_layer_freq
+ self.use_grouped_topk = use_grouped_topk
+ self.num_expert_group = num_expert_group
+ self.topk_group = topk_group
+ self.num_nextn_predict_layers = num_nextn_predict_layers
+
+ if linear_attn_config is not None:
+ assert linear_attn_config["kda_layers"] is not None
+ assert linear_attn_config["full_attn_layers"] is not None
+ self.linear_attn_config = linear_attn_config
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ @property
+ def is_mla(self):
+ return (
+ self.q_lora_rank is not None
+ or self.kv_lora_rank is not None
+ or self.qk_nope_head_dim is not None
+ or self.qk_rope_head_dim is not None
+ or self.v_head_dim is not None
+ or self.mla_use_nope is True
+ )
+
+ @property
+ def is_moe(self):
+ return self.num_experts is not None
+
+ @property
+ def is_linear_attn(self) -> bool:
+ return not (
+ self.linear_attn_config is None
+ or (
+ isinstance(self.linear_attn_config, dict)
+ and self.linear_attn_config["kda_layers"] is not None
+ and len(self.linear_attn_config["kda_layers"]) == 0
+ )
+ )
+
+ def is_kda_layer(self, layer_idx: int):
+ return (
+ self.linear_attn_config is not None
+ and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
+ )
+
+ @property
+ def linear_layer_ids(self):
+ return [i for i in range(self.num_hidden_layers) if self.is_kda_layer(i)]
+
+ @property
+ def full_attention_layer_ids(self):
+ return [i for i in range(self.num_hidden_layers) if not self.is_kda_layer(i)]
+
+ @property
+ def mamba2_cache_params(self) -> KimiLinearCacheParams:
+ shape = KimiLinearStateShape.create(
+ tp_world_size=get_attention_tp_size(),
+ num_heads=self.linear_attn_config["num_heads"],
+ head_dim=self.linear_attn_config["head_dim"],
+ conv_kernel_size=self.linear_attn_config["short_conv_kernel_size"],
+ )
+
+ return KimiLinearCacheParams(shape=shape, layers=self.linear_layer_ids)
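
The new KimiLinearConfig partitions layers into KDA (linear-attention) and full-attention groups by membership in the 1-indexed kda_layers list of linear_attn_config, as implemented by is_kda_layer / linear_layer_ids above. A standalone restatement of that partition logic, with a made-up layer layout (real checkpoints ship their own linear_attn_config):

# Hypothetical 8-layer layout; kda_layers / full_attn_layers are 1-indexed in the config.
num_hidden_layers = 8
linear_attn_config = {"kda_layers": [1, 2, 3, 5, 6, 7], "full_attn_layers": [4, 8]}

def is_kda_layer(layer_idx: int) -> bool:
    # layer_idx is 0-indexed; the config list is 1-indexed
    return (layer_idx + 1) in linear_attn_config["kda_layers"]

linear_layer_ids = [i for i in range(num_hidden_layers) if is_kda_layer(i)]
full_attention_layer_ids = [i for i in range(num_hidden_layers) if not is_kda_layer(i)]

print(linear_layer_ids)          # [0, 1, 2, 4, 5, 6]
print(full_attention_layer_ids)  # [3, 7]
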
sglang/srt/configs/mamba_utils.py

@@ -14,6 +14,7 @@

  import os
  from dataclasses import dataclass, field
+ from typing import List, Optional

  import numpy as np
  import torch
@@ -115,3 +116,68 @@ class Mamba2CacheParams:
  int(np.prod(self.shape.conv)) * self.dtype.conv.itemsize
  + int(np.prod(self.shape.temporal)) * self.dtype.temporal.itemsize
  ) * len(self.layers)
+
+
+ @dataclass(kw_only=True, frozen=True)
+ class KimiLinearStateShape:
+ conv: List[tuple[int, int]]
+ temporal: tuple[int, int, int]
+
+ num_heads: int
+ head_dim: int
+ num_k_heads: int
+ head_k_dim: int
+ conv_kernel: int
+ num_spec: int
+
+ @staticmethod
+ def create(
+ *,
+ tp_world_size: int,
+ num_heads: int,
+ head_dim: int,
+ num_k_heads: Optional[int] = None,
+ head_k_dim: Optional[int] = None,
+ conv_kernel_size: int = 4,
+ num_spec: int = 0,
+ ) -> "KimiLinearStateShape":
+ if num_k_heads is None:
+ num_k_heads = num_heads
+ if head_k_dim is None:
+ head_k_dim = head_dim
+
+ proj_size = num_heads * head_dim
+ proj_k_size = num_k_heads * head_k_dim
+
+ conv_state_shape = (divide(proj_size, tp_world_size), conv_kernel_size - 1)
+ conv_state_k_shape = (divide(proj_k_size, tp_world_size), conv_kernel_size - 1)
+ temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, head_dim)
+
+ conv_state_shape = conv_state_shape[1], conv_state_shape[0]
+ conv_state_k_shape = conv_state_k_shape[1], conv_state_k_shape[0]
+
+ return KimiLinearStateShape(
+ conv=[conv_state_shape, conv_state_k_shape, conv_state_k_shape],
+ temporal=temporal_state_shape,
+ num_heads=num_heads,
+ head_dim=head_dim,
+ num_k_heads=num_k_heads,
+ head_k_dim=head_k_dim,
+ conv_kernel=conv_kernel_size,
+ num_spec=num_spec,
+ )
+
+
+ @dataclass(kw_only=True, frozen=True)
+ class KimiLinearCacheParams:
+ shape: KimiLinearStateShape
+ dtype: Mamba2StateDType = field(default_factory=mamba2_state_dtype)
+ layers: list[int]
+
+ @property
+ def mamba_cache_per_req(self) -> int:
+ return (
+ int(np.sum([np.prod(conv_shape) for conv_shape in self.shape.conv]))
+ * self.dtype.conv.itemsize
+ + int(np.prod(self.shape.temporal)) * self.dtype.temporal.itemsize
+ ) * len(self.layers)
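
The companion mamba_utils.py addition sizes the per-request recurrent cache for Kimi Linear: three short-conv state tensors (stored transposed as (kernel_size - 1, channels)) plus one (heads, head_dim, head_dim) temporal state, summed in bytes and multiplied by the number of KDA layers. A standalone sketch of the mamba_cache_per_req arithmetic with assumed sizes and a 2-byte state dtype; the real dtypes come from mamba2_state_dtype(), and head counts from the checkpoint's linear_attn_config:

import numpy as np

# Assumed example sizes, not taken from any released Kimi Linear checkpoint.
tp_world_size, num_heads, head_dim, conv_kernel = 1, 32, 128, 4
num_linear_layers = 24
itemsize = 2  # assume 2-byte (bf16/fp16) states for illustration

proj = num_heads * head_dim // tp_world_size
conv_shapes = [(conv_kernel - 1, proj)] * 3  # one q-proj and two k-proj conv states (equal with default dims)
temporal_shape = (num_heads // tp_world_size, head_dim, head_dim)

per_layer = (sum(int(np.prod(s)) for s in conv_shapes) * itemsize
             + int(np.prod(temporal_shape)) * itemsize)
print(per_layer * num_linear_layers, "bytes per request")  # 26935296
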