sglang 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. sglang/api.py +13 -1
  2. sglang/bench_latency.py +10 -5
  3. sglang/bench_serving.py +50 -26
  4. sglang/check_env.py +15 -0
  5. sglang/global_config.py +1 -1
  6. sglang/lang/backend/runtime_endpoint.py +60 -49
  7. sglang/lang/chat_template.py +10 -5
  8. sglang/lang/compiler.py +4 -0
  9. sglang/lang/interpreter.py +5 -2
  10. sglang/lang/ir.py +22 -4
  11. sglang/launch_server.py +8 -1
  12. sglang/srt/constrained/jump_forward.py +13 -2
  13. sglang/srt/conversation.py +50 -1
  14. sglang/srt/hf_transformers_utils.py +22 -23
  15. sglang/srt/layers/activation.py +24 -2
  16. sglang/srt/layers/decode_attention.py +338 -50
  17. sglang/srt/layers/extend_attention.py +3 -1
  18. sglang/srt/layers/fused_moe/__init__.py +1 -0
  19. sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
  20. sglang/srt/layers/fused_moe/layer.py +587 -0
  21. sglang/srt/layers/layernorm.py +3 -0
  22. sglang/srt/layers/logits_processor.py +64 -27
  23. sglang/srt/layers/radix_attention.py +41 -18
  24. sglang/srt/layers/sampler.py +154 -0
  25. sglang/srt/managers/controller_multi.py +2 -8
  26. sglang/srt/managers/controller_single.py +7 -10
  27. sglang/srt/managers/detokenizer_manager.py +20 -9
  28. sglang/srt/managers/io_struct.py +44 -11
  29. sglang/srt/managers/policy_scheduler.py +5 -2
  30. sglang/srt/managers/schedule_batch.py +59 -179
  31. sglang/srt/managers/tokenizer_manager.py +193 -84
  32. sglang/srt/managers/tp_worker.py +131 -50
  33. sglang/srt/mem_cache/memory_pool.py +82 -8
  34. sglang/srt/mm_utils.py +79 -7
  35. sglang/srt/model_executor/cuda_graph_runner.py +97 -28
  36. sglang/srt/model_executor/forward_batch_info.py +188 -82
  37. sglang/srt/model_executor/model_runner.py +269 -87
  38. sglang/srt/models/chatglm.py +6 -14
  39. sglang/srt/models/commandr.py +6 -2
  40. sglang/srt/models/dbrx.py +5 -1
  41. sglang/srt/models/deepseek.py +7 -3
  42. sglang/srt/models/deepseek_v2.py +12 -7
  43. sglang/srt/models/gemma.py +6 -2
  44. sglang/srt/models/gemma2.py +22 -8
  45. sglang/srt/models/gpt_bigcode.py +5 -1
  46. sglang/srt/models/grok.py +66 -398
  47. sglang/srt/models/internlm2.py +5 -1
  48. sglang/srt/models/llama2.py +7 -3
  49. sglang/srt/models/llama_classification.py +2 -2
  50. sglang/srt/models/llama_embedding.py +4 -0
  51. sglang/srt/models/llava.py +176 -59
  52. sglang/srt/models/minicpm.py +7 -3
  53. sglang/srt/models/mixtral.py +61 -255
  54. sglang/srt/models/mixtral_quant.py +6 -5
  55. sglang/srt/models/qwen.py +7 -4
  56. sglang/srt/models/qwen2.py +15 -5
  57. sglang/srt/models/qwen2_moe.py +7 -16
  58. sglang/srt/models/stablelm.py +6 -2
  59. sglang/srt/openai_api/adapter.py +149 -58
  60. sglang/srt/sampling/sampling_batch_info.py +209 -0
  61. sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -4
  62. sglang/srt/server.py +107 -71
  63. sglang/srt/server_args.py +49 -15
  64. sglang/srt/utils.py +27 -18
  65. sglang/test/runners.py +38 -38
  66. sglang/test/simple_eval_common.py +9 -10
  67. sglang/test/simple_eval_gpqa.py +2 -1
  68. sglang/test/simple_eval_humaneval.py +2 -2
  69. sglang/test/simple_eval_math.py +2 -1
  70. sglang/test/simple_eval_mmlu.py +2 -1
  71. sglang/test/test_activation.py +55 -0
  72. sglang/test/test_programs.py +32 -5
  73. sglang/test/test_utils.py +37 -50
  74. sglang/version.py +1 -1
  75. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA +102 -27
  76. sglang-0.2.14.dist-info/RECORD +114 -0
  77. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
  78. sglang/launch_server_llavavid.py +0 -29
  79. sglang/srt/model_loader/model_loader.py +0 -292
  80. sglang/srt/model_loader/utils.py +0 -275
  81. sglang-0.2.12.dist-info/RECORD +0 -112
  82. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
  83. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -17,9 +17,12 @@ limitations under the License.
 
 import argparse
 import dataclasses
+import logging
 import random
 from typing import List, Optional, Union
 
+logger = logging.getLogger(__name__)
+
 
 @dataclasses.dataclass
 class ServerArgs:
@@ -30,11 +33,13 @@ class ServerArgs:
     skip_tokenizer_init: bool = False
     load_format: str = "auto"
     dtype: str = "auto"
+    kv_cache_dtype: str = "auto"
     trust_remote_code: bool = True
     context_length: Optional[int] = None
     quantization: Optional[str] = None
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
+    is_embedding: bool = False
 
     # Port
     host: str = "127.0.0.1"
@@ -46,7 +51,7 @@ class ServerArgs:
     max_running_requests: Optional[int] = None
     max_num_reqs: Optional[int] = None
     max_total_tokens: Optional[int] = None
-    chunked_prefill_size: int = -1
+    chunked_prefill_size: int = 8192
     max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
@@ -76,12 +81,14 @@ class ServerArgs:
     disable_radix_cache: bool = False
     disable_regex_jump_forward: bool = False
     disable_cuda_graph: bool = False
+    disable_cuda_graph_padding: bool = False
     disable_disk_cache: bool = False
+    disable_custom_all_reduce: bool = False
+    enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
     enable_p2p_check: bool = False
     enable_mla: bool = False
-    attention_reduce_in_fp32: bool = False
-    efficient_weight_load: bool = False
+    triton_attention_reduce_in_fp32: bool = False
 
     # Distributed args
     nccl_init_addr: Optional[str] = None
@@ -190,11 +197,23 @@ class ServerArgs:
             '* "float" is shorthand for FP32 precision.\n'
             '* "float32" for FP32 precision.',
         )
+        parser.add_argument(
+            "--kv-cache-dtype",
+            type=str,
+            default=ServerArgs.kv_cache_dtype,
+            choices=["auto", "fp8_e5m2"],
+            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.',
+        )
        parser.add_argument(
            "--trust-remote-code",
            action="store_true",
            help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
        )
+        parser.add_argument(
+            "--is-embedding",
+            action="store_true",
+            help="Whether to use a CausalLM as an embedding model.",
+        )
        parser.add_argument(
            "--context-length",
            type=int,
@@ -388,11 +407,27 @@ class ServerArgs:
            action="store_true",
            help="Disable cuda graph.",
        )
+        parser.add_argument(
+            "--disable-cuda-graph-padding",
+            action="store_true",
+            help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
+        )
        parser.add_argument(
            "--disable-disk-cache",
            action="store_true",
            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
        )
+        parser.add_argument(
+            "--disable-custom-all-reduce",
+            action="store_true",
+            default=False,
+            help="Disable the custom all-reduce kernel and fall back to NCCL.",
+        )
+        parser.add_argument(
+            "--enable-mixed-chunk",
+            action="store_true",
+            help="Enabling mixing prefill and decode in a batch when using chunked prefill.",
+        )
        parser.add_argument(
            "--enable-torch-compile",
            action="store_true",
@@ -406,13 +441,13 @@ class ServerArgs:
        parser.add_argument(
            "--enable-mla",
            action="store_true",
-            help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2",
+            help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
        )
        parser.add_argument(
-            "--attention-reduce-in-fp32",
+            "--triton-attention-reduce-in-fp32",
            action="store_true",
            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
-            "This only affects Triton attention kernels",
+            "This only affects Triton attention kernels.",
        )
        parser.add_argument(
            "--efficient-weight-load",
@@ -430,15 +465,6 @@ class ServerArgs:
    def url(self):
        return f"http://{self.host}:{self.port}"
 
-    def print_mode_args(self):
-        return (
-            f"disable_flashinfer={self.disable_flashinfer}, "
-            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
-            f"disable_radix_cache={self.disable_radix_cache}, "
-            f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
-            f"disable_disk_cache={self.disable_disk_cache}, "
-        )
-
    def check_server_args(self):
        assert (
            self.tp_size % self.nnodes == 0
@@ -446,6 +472,14 @@ class ServerArgs:
        assert not (
            self.dp_size > 1 and self.node_rank is not None
        ), "multi-node data parallel is not supported"
+        if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
+            logger.info(
+                "Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True"
+            )
+            self.trust_remote_code = False
+        if "gemma-2" in self.model_path.lower():
+            logger.info("When using sliding window in gemma-2, turn on flashinfer.")
+            self.disable_flashinfer = False
 
 
 @dataclasses.dataclass
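
Note: the hunks above add several new server options (kv_cache_dtype, is_embedding, disable_cuda_graph_padding, disable_custom_all_reduce, enable_mixed_chunk), change the chunked_prefill_size default to 8192, and rename attention_reduce_in_fp32 to triton_attention_reduce_in_fp32. A minimal sketch of setting the new fields programmatically; the model path is a placeholder and every field not shown is assumed to keep its default:

    from sglang.srt.server_args import ServerArgs

    # Hypothetical configuration exercising the fields introduced in 0.2.14.
    args = ServerArgs(
        model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
        kv_cache_dtype="fp8_e5m2",       # new: FP8 E5M2 KV cache (CUDA 11.8+)
        chunked_prefill_size=8192,       # new default (previously -1, i.e. disabled)
        enable_mixed_chunk=True,         # new: mix prefill and decode in one batch
        disable_custom_all_reduce=True,  # new: fall back to NCCL all-reduce
        is_embedding=False,              # new: set True to serve a CausalLM as an embedding model
    )
    args.check_server_args()
    print(args.url())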
sglang/srt/utils.py CHANGED
@@ -35,7 +35,6 @@ import torch
 import torch.distributed as dist
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
-from starlette.middleware.base import BaseHTTPMiddleware
 from torch.nn.parameter import Parameter
 from triton.runtime.cache import (
     FileCacheManager,
@@ -225,13 +224,18 @@ def is_multimodal_model(model):
     raise ValueError("unrecognized type")
 
 
-def is_generation_model(model_architectures):
+def is_generation_model(model_architectures, is_embedding: bool = False):
+    # We have two ways to determine whether a model is a generative model.
+    # 1. Check the model architectue
+    # 2. check the `is_embedding` server args
+
     if (
         "LlamaEmbeddingModel" in model_architectures
         or "MistralModel" in model_architectures
     ):
         return False
-    return True
+    else:
+        return not is_embedding
 
 
 def decode_video_base64(video_base64):
@@ -348,7 +352,7 @@ def suppress_other_loggers():
         logging.WARN
     )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
-    logging.getLogger("vllm.utils").setLevel(logging.WARN)
+    logging.getLogger("vllm.utils").setLevel(logging.ERROR)
 
 
 def assert_pkg_version(pkg: str, min_version: str, message: str):
@@ -370,14 +374,11 @@ def kill_parent_process():
     """Kill the parent process and all children of the parent process."""
     current_process = psutil.Process()
     parent_process = current_process.parent()
-    children = parent_process.children(recursive=True)
-    for child in children:
-        if child.pid != current_process.pid:
-            os.kill(child.pid, 9)
-    os.kill(parent_process.pid, 9)
+    kill_child_process(parent_process.pid, skip_pid=current_process.pid)
 
 
-def kill_child_process(pid, including_parent=True):
+def kill_child_process(pid, including_parent=True, skip_pid=None):
+    """Kill the process and all its children process."""
     try:
         parent = psutil.Process(pid)
     except psutil.NoSuchProcess:
@@ -385,6 +386,8 @@ def kill_child_process(pid, including_parent=True):
 
     children = parent.children(recursive=True)
     for child in children:
+        if child.pid == skip_pid:
+            continue
         try:
             child.kill()
         except psutil.NoSuchProcess:
@@ -453,10 +456,6 @@ def monkey_patch_vllm_dummy_weight_loader():
            quant_method = getattr(module, "quant_method", None)
            if quant_method is not None:
                quant_method.process_weights_after_loading(module)
-            # FIXME: Remove this after Mixtral is updated
-            # to use quant_method.
-            if hasattr(module, "process_weights_after_loading"):
-                module.process_weights_after_loading()
 
        # NOTE(woosuk): For accurate performance evaluation, we assign
        # random values to the weights.
@@ -644,7 +643,7 @@ def set_ulimit(target_soft_limit=65535):
        logger.warn(f"Fail to set RLIMIT_NOFILE: {e}")
 
 
-def is_llama3_405b_fp8(model_config):
+def is_llama3_405b_fp8_head_16(model_config):
    """Return whether the model is meta-llama/Meta-Llama-3.1-405B-FP8 with 16 kv heads."""
    if (
        model_config.hf_config.architectures[0] == "LlamaForCausalLM"
@@ -693,7 +692,7 @@ def monkey_patch_vllm_qvk_linear_loader():
    setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)
 
 
-def add_api_key_middleware(app, api_key):
+def add_api_key_middleware(app, api_key: str):
    @app.middleware("http")
    async def authentication(request, call_next):
        if request.method == "OPTIONS":
@@ -705,7 +704,7 @@ def add_api_key_middleware(app, api_key):
        return await call_next(request)
 
 
-def prepare_model(model_path):
+def prepare_model(model_path: str):
    if "SGLANG_USE_MODELSCOPE" in os.environ:
        if not os.path.exists(model_path):
            from modelscope import snapshot_download
@@ -714,7 +713,7 @@ def prepare_model(model_path):
    return model_path
 
 
-def prepare_tokenizer(tokenizer_path):
+def prepare_tokenizer(tokenizer_path: str):
    if "SGLANG_USE_MODELSCOPE" in os.environ:
        if not os.path.exists(tokenizer_path):
            from modelscope import snapshot_download
@@ -723,3 +722,13 @@ def prepare_tokenizer(tokenizer_path):
            tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
        )
    return tokenizer_path
+
+
+def configure_logger(server_args, prefix: str = ""):
+    format = f"[%(asctime)s{prefix}] %(message)s"
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format=format,
+        datefmt="%H:%M:%S",
+        force=True,
+    )
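
Note: a rough usage sketch for the two helpers touched above, assuming only what the diff shows (configure_logger reads a log_level attribute; kill_child_process now accepts skip_pid). The SimpleNamespace stand-in is illustrative and not part of sglang:

    import os
    from types import SimpleNamespace

    from sglang.srt.utils import configure_logger, kill_child_process

    # Per-process logging with a prefix, e.g. to tag a tensor-parallel rank.
    fake_server_args = SimpleNamespace(log_level="info")  # stand-in for the real ServerArgs
    configure_logger(fake_server_args, prefix=" TP0")

    # Kill all children of this process while keeping the process itself alive.
    kill_child_process(os.getpid(), including_parent=False)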
sglang/test/runners.py CHANGED
@@ -14,7 +14,8 @@ limitations under the License.
 """
 
 import json
-import multiprocessing
+import multiprocessing as mp
+import os
 from dataclasses import dataclass
 from typing import List, Union
 
@@ -23,16 +24,22 @@ import torch.nn.functional as F
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from sglang.srt.server import Runtime
-from sglang.srt.utils import is_generation_model
+from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER
 
 DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
     # "The capital of France is",
+    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
     "The capital of the United Kindom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
 ]
 
+dirpath = os.path.dirname(__file__)
+with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f:
+    long_prompt = f.read()
+DEFAULT_PROMPTS.append(long_prompt)
+
 NUM_TOP_LOGPROBS = 5
 
 
@@ -56,44 +63,37 @@ class HFRunner:
     def __init__(
         self,
         model_path,
-        torch_dtype=torch.float16,
-        is_generation_model=None,
+        torch_dtype,
+        is_generation,
     ):
-        self.in_queue = multiprocessing.Queue()
-        self.out_queue = multiprocessing.Queue()
+        self.is_generation = is_generation
+
+        self.in_queue = mp.Queue()
+        self.out_queue = mp.Queue()
 
-        self.model_proc = multiprocessing.Process(
+        self.model_proc = mp.Process(
            target=self.start_model_process,
            args=(
                self.in_queue,
                self.out_queue,
                model_path,
                torch_dtype,
-                is_generation_model,
            ),
        )
        self.model_proc.start()
 
-    def start_model_process(
-        self, in_queue, out_queue, model_path, torch_dtype, is_generation_model
-    ):
+    def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            torch_dtype=torch_dtype,
-            trust_remote_code=True,
        )
 
-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
-        if self.is_generation_model:
+        if self.is_generation:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch_dtype,
+                trust_remote_code=False,
                low_cpu_mem_usage=True,
-                trust_remote_code=True,
            ).cuda()
        else:
            from sentence_transformers import SentenceTransformer
@@ -106,7 +106,7 @@
        while True:
            prompts, max_new_tokens = in_queue.get()
            if prompts is not None:
-                if self.is_generation_model:
+                if self.is_generation:
                    output_strs = []
                    prefill_logprobs = []
                    for p in prompts:
@@ -125,16 +125,14 @@
                        )
 
                        logits = self.model.forward(input_ids).logits[0]
-                        logprobs = F.log_softmax(
-                            logits, dim=-1, dtype=torch.float32
-                        ).tolist()
-                        # index_of_max = (lambda nums: nums.index(max(nums)))(logprobs[-1])
-                        # print("index", index_of_max)
-                        logprobs = [
-                            sorted(token_logprobs, reverse=True)[:NUM_TOP_LOGPROBS]
-                            for token_logprobs in logprobs
-                        ]
-                        prefill_logprobs.append(logprobs)
+                        logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+                        logprobs, top_indices = torch.topk(
+                            logprobs, k=NUM_TOP_LOGPROBS, dim=-1
+                        )
+                        # print("index", top_indices)
+                        prefill_logprobs.append(logprobs.tolist())
+                        del logits
+                        del logprobs
 
                    out_queue.put(
                        ModelOutput(
@@ -171,19 +169,20 @@ class SRTRunner:
    def __init__(
        self,
        model_path,
+        torch_dtype,
+        is_generation,
        tp_size=1,
-        torch_dtype=torch.float16,
-        is_generation_model=None,
+        port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
    ):
-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
+        self.is_generation = is_generation
        self.runtime = Runtime(
            model_path=model_path,
            tp_size=tp_size,
            dtype=get_dtype_str(torch_dtype),
+            port=port,
+            mem_fraction_static=0.69,
+            trust_remote_code=False,
+            is_embedding=not self.is_generation,
        )
 
    def forward(
@@ -191,7 +190,7 @@
        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
        max_new_tokens=8,
    ):
-        if self.is_generation_model:
+        if self.is_generation:
            # the return value contains logprobs from prefill
            output_strs = []
            top_input_logprobs = []
@@ -201,6 +200,7 @@
                    prompt,
                    sampling_params=sampling_params,
                    return_logprob=True,
+                    logprob_start_len=0,
                    top_logprobs_num=NUM_TOP_LOGPROBS,
                )
                response = json.loads(response)
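
Note: both test runners now take torch_dtype and is_generation explicitly (the old is_generation_model=None auto-detection is gone), and SRTRunner defaults to a dedicated test port. A hypothetical harness built on the updated constructors; the model name is a placeholder and the HFRunner.forward call assumes that method, which is outside these hunks, kept its existing name:

    import torch

    from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner

    model_path = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder

    hf = HFRunner(model_path, torch_dtype=torch.float16, is_generation=True)
    srt = SRTRunner(model_path, torch_dtype=torch.float16, is_generation=True, tp_size=1)

    # Compare greedy outputs and prefill logprobs on a single short prompt.
    hf_out = hf.forward(DEFAULT_PROMPTS[1:2], max_new_tokens=8)
    srt_out = srt.forward(DEFAULT_PROMPTS[1:2], max_new_tokens=8)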
sglang/test/simple_eval_common.py CHANGED
@@ -1,13 +1,12 @@
 # Adapted from https://github.com/openai/simple-evals/
 
-import base64
 import os
 import resource
 import time
 from collections import defaultdict
 from dataclasses import dataclass, field
 from multiprocessing.pool import ThreadPool
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import httpx
 import jinja2
@@ -44,8 +43,8 @@ class EvalResult:
     Result of running an evaluation (usually consisting of many samples)
     """
 
-    score: float | None  # top-line metric
-    metrics: Dict[str, float] | None  # other metrics
+    score: Optional[float]  # top-line metric
+    metrics: Optional[Dict[str, float]]  # other metrics
     htmls: List[str]  # strings of valid HTML
     convos: List[MessageList]  # sampled conversations
 
@@ -56,10 +55,10 @@ class SingleEvalResult:
     Result of evaluating a single sample
     """
 
-    score: float | None
+    score: Optional[float]
     metrics: Dict[str, float] = field(default_factory=dict)
-    html: str | None = None
-    convo: MessageList | None = None  # sampled conversation
+    html: Optional[str] = None
+    convo: Optional[MessageList] = None  # sampled conversation
 
 
 class Eval:
@@ -89,8 +88,8 @@ class ChatCompletionSampler(SamplerBase):
     def __init__(
         self,
         base_url: str = None,
-        model: str | None = None,
-        system_message: str | None = None,
+        model: Optional[str] = None,
+        system_message: Optional[str] = None,
         temperature: float = 0.0,
         max_tokens: int = 2048,
     ):
@@ -272,7 +271,7 @@ def _compute_stat(values: list, stat: str):
 def aggregate_results(
     single_eval_results: List[SingleEvalResult],
     default_stats: Tuple[str] = ("mean", "std"),
-    name2stats: Dict[str, Tuple[str]] | None = None,
+    name2stats: Optional[Dict[str, Tuple[str]]] = None,
 ) -> EvalResult:
     """
     Aggregate results from multiple evaluations into a single EvalResult.
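
Note: the changes in this file and in the four simple_eval_* modules below all replace PEP 604 unions (X | None) with typing.Optional. The motivation is not stated in the diff, but a likely reason is that X | None in a runtime-evaluated annotation raises TypeError on Python 3.9 and earlier, while Optional[X] is equivalent and works on older interpreters:

    from typing import Optional

    # Works on Python 3.8+:
    def top_line(score: Optional[float]) -> float:
        return 0.0 if score is None else score

    # Same meaning, but the annotation only evaluates on Python 3.10+:
    # def top_line(score: float | None) -> float: ...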
sglang/test/simple_eval_gpqa.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2311.12022
 
 import random
 import re
+from typing import Optional
 
 import pandas
 
@@ -28,7 +29,7 @@ class GPQAEval(Eval):
     def __init__(
         self,
         filename: str,
-        num_examples: int | None,
+        num_examples: Optional[int],
         num_threads: int,
         n_repeats: int = 1,
     ):
sglang/test/simple_eval_humaneval.py CHANGED
@@ -9,7 +9,7 @@ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 import random
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import tqdm
 
@@ -61,7 +61,7 @@ def evaluate_functional_correctness(
 class HumanEval(Eval):
     def __init__(
         self,
-        num_examples: int | None,
+        num_examples: Optional[int],
         num_threads: int,
         num_samples_per_task: int = 5,
         ks_passes: List[int] = [1, 2, 5],
sglang/test/simple_eval_math.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2103.03874
 
 import random
 import re
+from typing import Optional
 
 import pandas
 
@@ -36,7 +37,7 @@ class MathEval(Eval):
         self,
         filename: str,
         equality_checker: SamplerBase,
-        num_examples: int | None,
+        num_examples: Optional[int],
         num_threads: int,
     ):
         df = pandas.read_csv(filename)
sglang/test/simple_eval_mmlu.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2009.03300
 
 import random
 import re
+from typing import Optional
 
 import pandas
 
@@ -84,7 +85,7 @@ subject2category = {
 
 
 class MMLUEval(Eval):
-    def __init__(self, filename: str, num_examples: int | None, num_threads: int):
+    def __init__(self, filename: str, num_examples: Optional[int], num_threads: int):
         df = pandas.read_csv(filename)
         examples = [row.to_dict() for _, row in df.iterrows()]
         if num_examples:
sglang/test/test_activation.py ADDED
@@ -0,0 +1,55 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import GeluAndMul
+
+
+class TestGeluAndMul(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gelu_and_mul_test(self, num_tokens, d, dtype, seed):
+        torch.manual_seed(seed)
+
+        layer = GeluAndMul().to(dtype=dtype)
+        x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x)
+            out = layer.forward_cuda(x)
+
+        if dtype == torch.bfloat16:
+            atol = rtol = 1e-2
+        else:
+            atol = rtol = 1e-3
+
+        self.assertTrue(torch.allclose(out, ref_out, atol=atol, rtol=rtol))
+
+    def test_gelu_and_mul(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._run_gelu_and_mul_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
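
Note: the new test compares GeluAndMul.forward_cuda against forward_native on inputs of shape (num_tokens, 2 * d). The activation itself is not part of this diff; a rough reference of the conventional gated-GELU pattern it exercises, where the split order and the approximate= mode are assumptions on my part:

    import torch
    import torch.nn.functional as F

    def gelu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
        # x has shape (..., 2 * d): apply GELU to the first half and gate the second half.
        d = x.shape[-1] // 2
        return F.gelu(x[..., :d], approximate="tanh") * x[..., d:]

    out = gelu_and_mul_reference(torch.randn(7, 2 * 512))
    assert out.shape == (7, 512)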
sglang/test/test_programs.py CHANGED
@@ -103,16 +103,19 @@ def test_decode_int():
 def test_decode_json_regex():
     @sgl.function
     def decode_json(s):
-        from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING
+        from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR
 
         s += "Generate a JSON object to describe the basic city information of Paris.\n"
+        s += "Here are the JSON object:\n"
+
+        # NOTE: we recommend using dtype gen or whole regex string to control the output
 
         with s.var_scope("json_output"):
             s += "{\n"
-            s += ' "name": ' + sgl.gen(regex=REGEX_STRING + ",") + "\n"
-            s += ' "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
-            s += ' "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
-            s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT) + "\n"
+            s += ' "name": ' + sgl.gen(regex=REGEX_STR) + ",\n"
+            s += ' "population": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n"
+            s += ' "area": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n"
+            s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT, stop=[" ", "\n"]) + "\n"
             s += "}"
 
         ret = decode_json.run(temperature=0.0)
@@ -359,6 +362,30 @@ def test_regex():
     assert re.match(regex, answer)
 
 
+def test_dtype_gen():
+    @sgl.function
+    def dtype_gen(s):
+        s += "Q: What is the full name of DNS?\n"
+        s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
+        s += "Q: Which year was DNS invented?\n"
+        s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
+        s += "Q: What is the value of pi?\n"
+        s += "A: " + sgl.gen("float_res", dtype=float) + "\n"
+        s += "Q: Is the sky blue?\n"
+        s += "A: " + sgl.gen("bool_res", dtype=bool) + "\n"
+
+    state = dtype_gen.run()
+
+    try:
+        state["int_res"] = int(state["int_res"])
+        state["float_res"] = float(state["float_res"])
+        state["bool_res"] = bool(state["bool_res"])
+        # assert state["str_res"].startswith('"') and state["str_res"].endswith('"')
+    except ValueError:
+        print(state)
+        raise
+
+
 def test_completion_speculative():
     @sgl.function(num_api_spec_tokens=64)
     def gen_character_spec(s):
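
Note: test_dtype_gen above exercises the new dtype= argument of sgl.gen. A minimal standalone sketch of the same pattern; the endpoint URL is a placeholder and a running sglang server (or another configured backend) is assumed:

    import sglang as sgl

    @sgl.function
    def city_facts(s):
        s += "Q: Which year was the Eiffel Tower completed?\n"
        s += "A: " + sgl.gen("year", dtype=int) + "\n"
        s += "Q: What is the latitude of Paris?\n"
        s += "A: " + sgl.gen("latitude", dtype=float) + "\n"

    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))  # placeholder endpoint
    state = city_facts.run(temperature=0.0)
    print(int(state["year"]), float(state["latitude"]))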