sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/eval/loogle_eval.py
ADDED
@@ -0,0 +1,157 @@
+import argparse
+import asyncio
+import os
+import pickle
+from pathlib import Path
+from typing import List
+
+import openai
+import torch
+from bert_score import BERTScorer
+from datasets import load_dataset
+from tqdm import tqdm
+
+
+def get_client(api_url: str) -> openai.AsyncOpenAI:
+    if os.getenv("OPENAI_API_KEY") is None:
+        os.environ["OPENAI_API_KEY"] = "EMPTY"
+    return openai.AsyncOpenAI(base_url=api_url)
+
+
+def get_dataset():
+    return load_dataset("bigai-nlco/LooGLE", "longdep_qa", split="test")
+
+
+async def fetch_response(
+    client: openai.AsyncOpenAI,
+    context: str,
+    question: str,
+    semaphore: asyncio.Semaphore,
+    index: int,
+    model: str,
+    output_dir: Path,
+):
+    output_file = output_dir / f"response_{index}.pkl"
+    if output_file.exists():
+        return
+
+    prompt = (
+        "Please answer the question based on the long texts below.\n"
+        f"{context}\n"
+        f"Question: {question}\n"
+        "Answer:"
+    )
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
+    ]
+
+    async with semaphore:
+        try:
+            response = await client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=0.0,
+                max_tokens=512,
+            )
+        except openai.BadRequestError as e:
+            with open(output_file, "wb") as f:
+                pickle.dump({"error": str(e)}, f)
+            return
+
+    with open(output_file, "wb") as f:
+        pickle.dump(response, f)
+
+
+async def benchmark(args):
+    dataset = get_dataset()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    client = get_client(args.api_url)
+    semaphore = asyncio.Semaphore(args.max_concurrency)
+
+    tasks: List[asyncio.Task] = []
+    for idx, ex in enumerate(dataset):
+        tasks.append(
+            asyncio.create_task(
+                fetch_response(
+                    client,
+                    ex["context"],
+                    ex["question"],
+                    semaphore,
+                    idx,
+                    args.model,
+                    output_dir,
+                )
+            )
+        )
+
+    for _ in tqdm(
+        asyncio.as_completed(tasks), total=len(tasks), desc="Running benchmark"
+    ):
+        await _
+
+
+def analyse(args):
+    dataset = get_dataset()
+    output_dir = Path(args.output_dir)
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    scorer = BERTScorer(lang="en", device=device)
+
+    hyps: List[str] = []
+    refs: List[str] = []
+    for idx, ex in enumerate(tqdm(dataset, desc="Loading responses")):
+        pkl_file = output_dir / f"response_{idx}.pkl"
+        if not pkl_file.exists():
+            raise FileNotFoundError(pkl_file)
+
+        response = pickle.load(open(pkl_file, "rb"))
+        if isinstance(response, dict) and "error" in response:
+            continue
+
+        hyps.append(response.choices[0].message.content.strip())
+        refs.append(ex["answer"])
+
+    if not hyps:
+        print("No valid responses to score!")
+        return
+
+    batch_size = 64
+    all_f1: List[float] = []
+    for i in tqdm(range(0, len(hyps), batch_size), desc="Scoring batches"):
+        h_batch = hyps[i : i + batch_size]
+        r_batch = refs[i : i + batch_size]
+        _, _, f1_scores = scorer.score(h_batch, r_batch, verbose=False)
+        all_f1.extend([float(x) for x in f1_scores])
+
+    avg = sum(all_f1) / len(all_f1)
+    print(f"Average BERTScore (F1): {avg:.2%}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Run benchmark and evaluation in one go."
+    )
+    parser.add_argument(
+        "--api-url",
+        default="http://127.0.0.1:30000/v1",
+        help="OpenAI-compatible API base URL",
+    )
+    parser.add_argument(
+        "--model",
+        default="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+        help="Model name or ID, only used for model name",
+    )
+    parser.add_argument(
+        "--max-concurrency", type=int, default=144, help="Maximum concurrent requests"
+    )
+    parser.add_argument(
+        "--output-dir", default="tmp-output-dir", help="Directory for cached responses"
+    )
+    args = parser.parse_args()
+
+    asyncio.run(benchmark(args))
+
+    analyse(args)
sglang/lang/chat_template.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from dataclasses import dataclass
 from enum import Enum, auto
 from typing import Callable, Dict, List, Tuple
@@ -71,9 +72,9 @@ def get_chat_template(name):
 
 def get_chat_template_by_model_path(model_path):
     for matching_func in matching_function_registry:
+        template_name = matching_func(model_path)
+        if template_name is not None:
+            return get_chat_template(template_name)
     return get_chat_template("default")
 
 
@@ -193,6 +194,21 @@ register_chat_template(
     )
 )
 
+# Reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="mistral",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
+        },
+        stop_str=("</s>",),
+        image_token="[IMG]",
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="llama-3-instruct",
@@ -270,6 +286,29 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="janus",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -395,6 +434,20 @@ register_chat_template(
     )
 )
 
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+register_chat_template(
+    ChatTemplate(
+        name="internvl-2-5",
+        default_system_prompt="你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="granite-3-instruct",
@@ -442,127 +495,118 @@ register_chat_template(
 
 @register_chat_template_matching_function
 def match_deepseek(model_path: str):
+    if re.search(r"deepseek-(v3|r1)", model_path, re.IGNORECASE) and not re.search(
+        r"base", model_path, re.IGNORECASE
+    ):
+        return "deepseek-v3"
 
 
 @register_chat_template_matching_function
 def match_deepseek_janus_pro(model_path: str):
-    if "janus"
+    if re.search(r"janus", model_path, re.IGNORECASE):
+        return "janus-pro"
 
 
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
-    if "dbrx"
+    if re.search(r"dbrx", model_path, re.IGNORECASE) and re.search(
+        r"instruct", model_path, re.IGNORECASE
+    ):
+        return "dbrx-instruct"
 
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
-    if "vicuna"
-    if "llava-v1.5" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
-    if "llava-next-video-7b" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
+    if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
+        return "vicuna_v1.1"
 
 
 @register_chat_template_matching_function
 def match_llama2_chat(model_path: str):
+    if re.search(
+        r"llama-2.*chat|codellama.*instruct",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "llama-2-chat"
+
+
+@register_chat_template_matching_function
+def match_mistral(model_path: str):
+    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
+        return "mistral"
 
 
 @register_chat_template_matching_function
 def match_llama3_instruct(model_path: str):
-        return get_chat_template("llama-3-instruct")
+    if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE):
+        return "llama-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
-        return get_chat_template("qwen")
-        "llava-v1.6-34b" in model_path
-        or "llava-v1.6-yi-34b" in model_path
-        or "llava-next-video-34b" in model_path
-        or "llava-onevision-qwen2" in model_path
+    if re.search(r"tinyllama", model_path, re.IGNORECASE):
+        return "chatml"
+    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
+        return "qwen2-vl"
+    if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
+    ):
+        return "qwen"
+    if re.search(
+        r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
+        model_path,
+        re.IGNORECASE,
     ):
+        return "chatml-llava"
 
 
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
+    if re.search(r"yi-vl", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
+    ):
+        return "yi-vl"
+    elif re.search(r"yi-1\.5.*chat", model_path, re.IGNORECASE):
+        return "yi-1.5"
 
 
 @register_chat_template_matching_function
 def match_gemma_it(model_path: str):
-        return get_chat_template("gemma-it")
+    if re.search(r"gemma.*it", model_path, re.IGNORECASE):
+        return "gemma-it"
 
 
 @register_chat_template_matching_function
 def match_openbmb_minicpm(model_path: str):
-        return get_chat_template("minicpmo")
+    if re.search(r"minicpm-v", model_path, re.IGNORECASE):
+        return "minicpmv"
+    elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
+        return "minicpmo"
 
 
 @register_chat_template_matching_function
 def match_c4ai_command_r(model_path: str):
-        return get_chat_template("c4ai-command-r")
+    if re.search(r"c4ai-command-r", model_path, re.IGNORECASE):
+        return "c4ai-command-r"
 
 
 @register_chat_template_matching_function
 def match_granite_instruct(model_path: str):
-    # need to be updated. For now, assume that the Granite 3.0
-    # template works across the board.
-    if "granite" in model_path and "instruct" in model_path:
-        return get_chat_template("granite-3-instruct")
+    if re.search(r"granite.*instruct", model_path, re.IGNORECASE):
+        return "granite-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_gemma3_instruct(model_path: str):
+    if re.search(r"gemma-3", model_path, re.IGNORECASE):
+        return "gemma-it"
+
+
+@register_chat_template_matching_function
+def match_internvl_chat(model_path: str):
+    if re.search(r"internvl2_5", model_path, re.IGNORECASE):
+        return "internvl-2-5"
 
 
 if __name__ == "__main__":
sglang/lang/tracer.py
CHANGED
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
         with TracingScope(tracer):
             tracer.ret_value = program.func(tracer, **arguments)
     except (StopTracing, TypeError, AttributeError):
-        # Some exceptions may not be
+        # Some exceptions may not be caught
        pass
 
     # Run and cache prefix
sglang/srt/configs/deepseekvl2.py
CHANGED
@@ -48,6 +48,9 @@ class DictOutput(object):
     def __getitem__(self, item):
         return self.__dict__[item]
 
+    def __contains__(self, key):
+        return key in self.__dict__
+
     def __setitem__(self, key, value):
         self.__dict__[key] = value
 
@@ -413,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
         h = w = math.ceil(
             (self.image_size // self.patch_size) / self.downsample_ratio
         )
-        # global views tokens h * (w + 1), 1 is for line
+        # global views tokens h * (w + 1), 1 is for line separator
         tokenized_image = [self.image_token_id] * h * (w + 1)
-        # add a
+        # add a separator between global and local views
         tokenized_image += [self.image_token_id]
         # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
         tokenized_image += (
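The three added lines in DictOutput make `in` checks work on these output objects, which store their items in __dict__. A standalone illustration (the class body below is copied from the hunk; only the usage at the bottom is new):

# Standalone copy of DictOutput for illustration; in the package it lives in
# sglang/srt/configs/deepseekvl2.py.
class DictOutput(object):
    def __getitem__(self, item):
        return self.__dict__[item]

    def __contains__(self, key):
        return key in self.__dict__

    def __setitem__(self, key, value):
        self.__dict__[key] = value

out = DictOutput()
out["input_ids"] = [1, 2, 3]
print("input_ids" in out)     # True, via the new __contains__
print("pixel_values" in out)  # False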
sglang/srt/configs/device_config.py
CHANGED
@@ -10,7 +10,7 @@ class DeviceConfig:
     device: Optional[torch.device]
 
     def __init__(self, device: str = "cuda") -> None:
-        if device in ["cuda", "xpu", "hpu", "cpu"]:
+        if device in ["cuda", "xpu", "hpu", "cpu", "npu"]:
             self.device_type = device
         else:
             raise RuntimeError(f"Not supported device type: {device}")
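The device_config.py change simply adds "npu" to the accepted device strings, in line with the new NPU communicator elsewhere in this release. A tiny sketch of the validation behaviour (a standalone function for illustration, not the real DeviceConfig class):

# Standalone sketch of the widened device-type check; DeviceConfig itself also
# resolves and stores a torch.device, which is omitted here.
SUPPORTED_DEVICE_TYPES = ["cuda", "xpu", "hpu", "cpu", "npu"]

def validate_device_type(device: str = "cuda") -> str:
    if device in SUPPORTED_DEVICE_TYPES:
        return device
    raise RuntimeError(f"Not supported device type: {device}")

print(validate_device_type("npu"))  # accepted as of 0.4.6.post4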
|