sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sglang/bench_one_batch.py +1 -11
  2. sglang/bench_serving.py +149 -1
  3. sglang/lang/chat_template.py +44 -0
  4. sglang/srt/configs/deepseekvl2.py +3 -0
  5. sglang/srt/configs/device_config.py +1 -1
  6. sglang/srt/configs/internvl.py +696 -0
  7. sglang/srt/configs/janus_pro.py +3 -0
  8. sglang/srt/configs/model_config.py +17 -0
  9. sglang/srt/constrained/xgrammar_backend.py +11 -19
  10. sglang/srt/conversation.py +30 -3
  11. sglang/srt/disaggregation/decode.py +4 -1
  12. sglang/srt/disaggregation/mini_lb.py +74 -23
  13. sglang/srt/disaggregation/mooncake/conn.py +9 -18
  14. sglang/srt/disaggregation/nixl/conn.py +241 -71
  15. sglang/srt/disaggregation/utils.py +44 -1
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  17. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  18. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  19. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  20. sglang/srt/distributed/parallel_state.py +22 -1
  21. sglang/srt/entrypoints/engine.py +14 -2
  22. sglang/srt/entrypoints/http_server.py +28 -1
  23. sglang/srt/entrypoints/verl_engine.py +3 -2
  24. sglang/srt/hf_transformers_utils.py +20 -1
  25. sglang/srt/layers/attention/flashattention_backend.py +146 -50
  26. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  27. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  28. sglang/srt/layers/attention/merge_state.py +46 -0
  29. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  30. sglang/srt/layers/attention/vision.py +290 -163
  31. sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
  32. sglang/srt/layers/moe/ep_moe/layer.py +120 -1
  33. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +4 -1
  37. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  38. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  39. sglang/srt/layers/quantization/deep_gemm.py +5 -0
  40. sglang/srt/layers/quantization/fp8.py +108 -95
  41. sglang/srt/layers/quantization/fp8_kernel.py +79 -60
  42. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  43. sglang/srt/layers/quantization/kv_cache.py +3 -10
  44. sglang/srt/layers/quantization/utils.py +0 -5
  45. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  46. sglang/srt/lora/lora_manager.py +10 -13
  47. sglang/srt/managers/cache_controller.py +115 -119
  48. sglang/srt/managers/io_struct.py +10 -0
  49. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  50. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  51. sglang/srt/managers/schedule_batch.py +19 -1
  52. sglang/srt/managers/schedule_policy.py +11 -5
  53. sglang/srt/managers/scheduler.py +28 -13
  54. sglang/srt/managers/tokenizer_manager.py +24 -13
  55. sglang/srt/managers/tp_worker.py +9 -12
  56. sglang/srt/mem_cache/chunk_cache.py +2 -0
  57. sglang/srt/mem_cache/memory_pool.py +2 -2
  58. sglang/srt/model_executor/model_runner.py +44 -33
  59. sglang/srt/model_loader/loader.py +18 -11
  60. sglang/srt/models/clip.py +4 -4
  61. sglang/srt/models/deepseek_janus_pro.py +1 -1
  62. sglang/srt/models/deepseek_nextn.py +1 -20
  63. sglang/srt/models/deepseek_v2.py +55 -20
  64. sglang/srt/models/gemma3_mm.py +1 -1
  65. sglang/srt/models/internlm2.py +3 -0
  66. sglang/srt/models/internvl.py +670 -0
  67. sglang/srt/models/llama.py +1 -1
  68. sglang/srt/models/llama4.py +53 -7
  69. sglang/srt/models/minicpmv.py +1 -1
  70. sglang/srt/models/mllama.py +1 -1
  71. sglang/srt/models/phi3_small.py +16 -2
  72. sglang/srt/models/qwen2_5_vl.py +8 -4
  73. sglang/srt/models/qwen2_vl.py +4 -4
  74. sglang/srt/models/xiaomi_mimo.py +171 -0
  75. sglang/srt/openai_api/adapter.py +24 -40
  76. sglang/srt/openai_api/protocol.py +28 -16
  77. sglang/srt/reasoning_parser.py +2 -2
  78. sglang/srt/sampling/sampling_batch_info.py +54 -2
  79. sglang/srt/sampling/sampling_params.py +2 -0
  80. sglang/srt/server_args.py +30 -6
  81. sglang/srt/utils.py +35 -1
  82. sglang/test/test_block_fp8.py +2 -2
  83. sglang/test/test_deepep_utils.py +219 -0
  84. sglang/test/test_utils.py +3 -1
  85. sglang/version.py +1 -1
  86. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +14 -6
  87. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +90 -80
  88. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
  89. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
  90. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -137,17 +137,7 @@ def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
-    model_config = ModelConfig(
-        server_args.model_path,
-        trust_remote_code=server_args.trust_remote_code,
-        revision=server_args.revision,
-        context_length=server_args.context_length,
-        model_override_args=server_args.json_model_override_args,
-        is_embedding=server_args.is_embedding,
-        enable_multimodal=server_args.enable_multimodal,
-        dtype=server_args.dtype,
-        quantization=server_args.quantization,
-    )
+    model_config = ModelConfig.from_server_args(server_args)
     model_runner = ModelRunner(
         model_config=model_config,
         mem_fraction_static=server_args.mem_fraction_static,
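This release replaces the ten-argument ModelConfig construction with a factory classmethod. model_config.py gains 17 lines in the same diff (file 8 above), consistent with a helper along these lines; what follows is a hedged sketch inferred from the deleted call site, not the verified implementation:

    class ModelConfig:
        @classmethod
        def from_server_args(cls, server_args, **kwargs):
            # Sketch: forward the same fields the old bench_one_batch call site passed.
            return cls(
                server_args.model_path,
                trust_remote_code=server_args.trust_remote_code,
                revision=server_args.revision,
                context_length=server_args.context_length,
                model_override_args=server_args.json_model_override_args,
                is_embedding=server_args.is_embedding,
                enable_multimodal=server_args.enable_multimodal,
                dtype=server_args.dtype,
                quantization=server_args.quantization,
                **kwargs,
            )

Whatever its exact body, the factory gives every caller a single construction path instead of a duplicated argument list.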
sglang/bench_serving.py CHANGED
@@ -58,6 +58,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
+    image_data: str
     extra_request_body: Dict[str, Any]
@@ -347,6 +348,11 @@ async def async_request_sglang_generate(
         "logprob_start_len": -1,
         **request_func_input.extra_request_body,
     }
+
+    # Add image data if available
+    if request_func_input.image_data:
+        payload["image_data"] = request_func_input.image_data
+
     headers = get_auth_headers()
 
     output = RequestFuncOutput()
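With this field in place the benchmark client forwards the image to the /generate endpoint alongside the text. The resulting request body looks roughly like this (a hedged illustration; only image_data comes from the hunk above, the surrounding keys are the usual bench_serving payload):

    payload = {
        "text": "Question: ...\n\nAnswer: ",
        "sampling_params": {"max_new_tokens": 256},
        "logprob_start_len": -1,
        "image_data": "data:image/jpeg;base64,/9j/4AAQ...",  # present only when set
    }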
@@ -510,6 +516,13 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             args=args,
         )
+    elif args.dataset_name == "mmmu":
+        input_requests = sample_mmmu_requests(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.random_output_len,
+            random_sample=True,
+        )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
@@ -597,6 +610,121 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
     return filename
 
 
+def sample_mmmu_requests(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+    random_sample: bool = True,
+) -> List[Tuple[str, int, int]]:
+    """
+    Sample requests from the MMMU dataset using HuggingFace datasets.
+
+    Args:
+        num_requests: Number of requests to sample.
+        tokenizer: Tokenizer to use for token counting.
+        fixed_output_len: If provided, use this fixed output length for all requests.
+        random_sample: Whether to randomly sample or take the first N.
+
+    Returns:
+        List of tuples (prompt, prompt_token_len, output_token_len).
+    """
+    try:
+        import base64
+        import io
+
+        from datasets import load_dataset
+    except ImportError:
+        raise ImportError("Please install datasets: pip install datasets")
+
+    print("Loading MMMU dataset from HuggingFace...")
+
+    try:
+        print("Attempting to load MMMU Math dataset...")
+        mmmu_dataset = load_dataset("MMMU/MMMU", "Math", split="test")
+        print(
+            f"Successfully loaded MMMU Math dataset from HuggingFace with {len(mmmu_dataset)} examples"
+        )
+    except Exception as e:
+        print(f"Failed to load MMMU Math dataset: {e}")
+        raise ValueError(f"Failed to load MMMU dataset: {e}")
+
+    # Sample from the dataset
+    if len(mmmu_dataset) > num_requests:
+        if random_sample:
+            # Random sample
+            indices = random.sample(range(len(mmmu_dataset)), num_requests)
+            sample_dataset = mmmu_dataset.select(indices)
+        else:
+            # Take first N
+            sample_dataset = mmmu_dataset.select(
+                range(min(num_requests, len(mmmu_dataset)))
+            )
+    else:
+        print(f"Dataset has less than {num_requests} examples, using all examples")
+        sample_dataset = mmmu_dataset
+
+    print(f"Selected {len(sample_dataset)} examples for benchmarking")
+
+    # Create prompts
+    filtered_dataset = []
+
+    for i, example in enumerate(sample_dataset):
+        try:
+            # Extract image_1
+            image = example.get("image_1")
+
+            if image is not None:
+                if hasattr(image, "save"):
+                    # Convert RGBA images to RGB before encoding
+                    if image.mode == "RGBA":
+                        image = image.convert("RGB")
+
+                    # Encode image to base64
+                    buffered = io.BytesIO()
+                    image.save(buffered, format="JPEG")
+                    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                    image_path = f"data:image/jpeg;base64,{img_str}"
+                else:
+                    continue
+
+                # Extract the question
+                question = example.get("question")
+
+                # Create the prompt with image, question
+                prompt = f"Question: {question}\n\nAnswer: "
+                prompt = tokenizer.apply_chat_template(
+                    [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "image_url", "image_url": {"url": image_path}},
+                                {"type": "text", "text": prompt},
+                            ],
+                        }
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+                prompt = f"<image>{image_path}</image>{prompt}"
+
+                # Calculate token lengths
+                # Note: This is approximate since we're not rendering the actual image tokens
+                prompt_token_ids = tokenizer.encode(prompt)
+                prompt_len = (
+                    len(prompt_token_ids) + 512
+                )  # Add estimate for image tokens
+
+                output_len = fixed_output_len if fixed_output_len is not None else 256
+
+                filtered_dataset.append((prompt, prompt_len, output_len))
+
+        except Exception as e:
+            print(f"Error processing example {i}: {e}")
+
+    print(f"\nCreated {len(filtered_dataset)} MMMU prompts")
+    return filtered_dataset
+
+
 def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
@@ -1004,6 +1132,15 @@ async def benchmark(
     else:
         lora_name = None
 
+    if "<image>" in test_prompt:
+        import re
+
+        image_match = re.search(r"<image>(.*?)</image>(.*)", test_prompt)
+        image_data = image_match.group(1) if image_match else None
+        test_prompt = image_match.group(2) if image_match else test_prompt
+    else:
+        image_data = None
+
     # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
@@ -1012,6 +1149,7 @@ async def benchmark(
         prompt_len=test_prompt_len,
         output_len=min(test_output_len, 32),
         lora_name=lora_name,
+        image_data=image_data,
         extra_request_body=extra_request_body,
     )
@@ -1063,6 +1201,15 @@ async def benchmark(
     else:
         lora_name = None
 
+    if "<image>" in prompt:
+        import re
+
+        image_match = re.search(r"<image>(.*?)</image>(.*)", prompt)
+        image_data = image_match.group(1) if image_match else None
+        prompt = image_match.group(2) if image_match else prompt
+    else:
+        image_data = None
+
     request_func_input = RequestFuncInput(
         model=model_id,
         prompt=prompt,
@@ -1070,6 +1217,7 @@ async def benchmark(
         prompt_len=prompt_len,
         output_len=output_len,
         lora_name=lora_name,
+        image_data=image_data,
         extra_request_body=extra_request_body,
     )
     tasks.append(
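Both hunks rely on the same convention sample_mmmu_requests uses when building prompts: the base64 data URL travels inside the text prompt between <image> and </image> markers and is split back out here before the request object is built. A standalone round trip (values illustrative):

    import re

    # A prompt in the form produced by sample_mmmu_requests (shortened, single line).
    prompt = "<image>data:image/jpeg;base64,/9j/4AAQ</image>Question: What is 2+2? Answer: "
    m = re.search(r"<image>(.*?)</image>(.*)", prompt)
    image_data = m.group(1) if m else None  # "data:image/jpeg;base64,/9j/4AAQ"
    prompt = m.group(2) if m else prompt    # "Question: What is 2+2? Answer: "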
@@ -1444,7 +1592,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
+        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
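Together these bench_serving.py changes add an end-to-end MMMU image benchmark. A plausible invocation against a locally served VLM (flags grounded in the diff: --dataset-name gains the mmmu choice, and sample_mmmu_requests reads args.num_prompts and args.random_output_len):

    python3 -m sglang.bench_serving --backend sglang \
        --dataset-name mmmu --num-prompts 100 --random-output-len 256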
sglang/lang/chat_template.py CHANGED
@@ -270,6 +270,29 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="janus",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
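Given the prefix/suffix table registered for "janus" above, a one-turn conversation with an image should render roughly as follows (hand-derived from the table, not output captured from sglang; the assistant text is illustrative):

    <|User|><image_placeholder>
    Describe the image.<|Assistant|>It shows a small cat.<|end▁of▁sentence|>

The empty system prefix and suffix mean a system message contributes nothing to the rendered prompt, and decoding stops at <|end▁of▁sentence|>.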
@@ -395,6 +418,20 @@ register_chat_template(
     )
 )
 
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+register_chat_template(
+    ChatTemplate(
+        name="internvl-2-5",
+        default_system_prompt="你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="granite-3-instruct",
@@ -565,6 +602,13 @@ def match_gemma3_instruct(model_path: str):
     return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_internvl_chat(model_path: str):
+    model_path = model_path.lower()
+    if "internvl" in model_path:
+        return get_chat_template("internvl-2-5")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
sglang/srt/configs/deepseekvl2.py CHANGED
@@ -48,6 +48,9 @@ class DictOutput(object):
     def __getitem__(self, item):
         return self.__dict__[item]
 
+    def __contains__(self, key):
+        return key in self.__dict__
+
     def __setitem__(self, key, value):
         self.__dict__[key] = value
 
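DictOutput already exposed dict-style reads and writes over the instance __dict__; the new __contains__ lets callers probe for optional fields with `in`. A minimal illustration (the subclass and field names are made up):

    class VLMImageOutput(DictOutput):
        def __init__(self):
            self.pixel_values = [1, 2, 3]

    out = VLMImageOutput()
    print("pixel_values" in out)  # True, via the new __contains__
    print(out["pixel_values"])    # [1, 2, 3], via the existing __getitem__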
sglang/srt/configs/device_config.py CHANGED
@@ -10,7 +10,7 @@ class DeviceConfig:
     device: Optional[torch.device]
 
     def __init__(self, device: str = "cuda") -> None:
-        if device in ["cuda", "xpu", "hpu", "cpu"]:
+        if device in ["cuda", "xpu", "hpu", "cpu", "npu"]:
             self.device_type = device
         else:
             raise RuntimeError(f"Not supported device type: {device}")
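The new "npu" entry lands alongside this release's NpuCommunicator (file 17 above, npu_communicator.py +39), extending device selection to NPUs, presumably Ascend hardware via torch_npu. A quick check of the changed behavior (assuming the import path matches the file location):

    from sglang.srt.configs.device_config import DeviceConfig

    cfg = DeviceConfig("npu")  # accepted as of 0.4.6.post3; raised RuntimeError in post2
    print(cfg.device_type)     # "npu"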