sglang 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. sglang/api.py +6 -0
  2. sglang/bench_latency.py +7 -3
  3. sglang/bench_serving.py +50 -26
  4. sglang/check_env.py +15 -0
  5. sglang/lang/chat_template.py +10 -5
  6. sglang/lang/compiler.py +4 -0
  7. sglang/lang/interpreter.py +1 -0
  8. sglang/lang/ir.py +9 -0
  9. sglang/launch_server.py +8 -1
  10. sglang/srt/conversation.py +50 -1
  11. sglang/srt/hf_transformers_utils.py +22 -23
  12. sglang/srt/layers/activation.py +24 -1
  13. sglang/srt/layers/decode_attention.py +338 -50
  14. sglang/srt/layers/fused_moe/layer.py +2 -2
  15. sglang/srt/layers/layernorm.py +3 -0
  16. sglang/srt/layers/logits_processor.py +60 -23
  17. sglang/srt/layers/radix_attention.py +3 -4
  18. sglang/srt/layers/sampler.py +154 -0
  19. sglang/srt/managers/controller_multi.py +2 -8
  20. sglang/srt/managers/controller_single.py +7 -10
  21. sglang/srt/managers/detokenizer_manager.py +20 -9
  22. sglang/srt/managers/io_struct.py +44 -11
  23. sglang/srt/managers/policy_scheduler.py +5 -2
  24. sglang/srt/managers/schedule_batch.py +52 -167
  25. sglang/srt/managers/tokenizer_manager.py +192 -83
  26. sglang/srt/managers/tp_worker.py +130 -43
  27. sglang/srt/mem_cache/memory_pool.py +82 -8
  28. sglang/srt/mm_utils.py +79 -7
  29. sglang/srt/model_executor/cuda_graph_runner.py +49 -11
  30. sglang/srt/model_executor/forward_batch_info.py +59 -27
  31. sglang/srt/model_executor/model_runner.py +210 -61
  32. sglang/srt/models/chatglm.py +4 -12
  33. sglang/srt/models/commandr.py +5 -1
  34. sglang/srt/models/dbrx.py +5 -1
  35. sglang/srt/models/deepseek.py +5 -1
  36. sglang/srt/models/deepseek_v2.py +5 -1
  37. sglang/srt/models/gemma.py +5 -1
  38. sglang/srt/models/gemma2.py +15 -7
  39. sglang/srt/models/gpt_bigcode.py +5 -1
  40. sglang/srt/models/grok.py +16 -2
  41. sglang/srt/models/internlm2.py +5 -1
  42. sglang/srt/models/llama2.py +7 -3
  43. sglang/srt/models/llama_classification.py +2 -2
  44. sglang/srt/models/llama_embedding.py +4 -0
  45. sglang/srt/models/llava.py +176 -59
  46. sglang/srt/models/minicpm.py +5 -1
  47. sglang/srt/models/mixtral.py +5 -1
  48. sglang/srt/models/mixtral_quant.py +5 -1
  49. sglang/srt/models/qwen.py +5 -2
  50. sglang/srt/models/qwen2.py +13 -3
  51. sglang/srt/models/qwen2_moe.py +5 -14
  52. sglang/srt/models/stablelm.py +5 -1
  53. sglang/srt/openai_api/adapter.py +117 -37
  54. sglang/srt/sampling/sampling_batch_info.py +209 -0
  55. sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -0
  56. sglang/srt/server.py +84 -56
  57. sglang/srt/server_args.py +43 -15
  58. sglang/srt/utils.py +26 -16
  59. sglang/test/runners.py +23 -31
  60. sglang/test/simple_eval_common.py +9 -10
  61. sglang/test/simple_eval_gpqa.py +2 -1
  62. sglang/test/simple_eval_humaneval.py +2 -2
  63. sglang/test/simple_eval_math.py +2 -1
  64. sglang/test/simple_eval_mmlu.py +2 -1
  65. sglang/test/test_activation.py +55 -0
  66. sglang/test/test_utils.py +36 -53
  67. sglang/version.py +1 -1
  68. {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/METADATA +92 -25
  69. sglang-0.2.14.dist-info/RECORD +114 -0
  70. {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
  71. sglang/launch_server_llavavid.py +0 -29
  72. sglang-0.2.13.dist-info/RECORD +0 -112
  73. {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
  74. {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/srt/sampling/sampling_batch_info.py ADDED
@@ -0,0 +1,209 @@
+ from __future__ import annotations
+
+ import dataclasses
+ from typing import TYPE_CHECKING, List
+
+ import torch
+
+ import sglang.srt.sampling.penaltylib as penaltylib
+
+ if TYPE_CHECKING:
+ from sglang.srt.managers.schedule_batch import ScheduleBatch
+
+
+ @dataclasses.dataclass
+ class SamplingBatchInfo:
+ # Basic Info
+ vocab_size: int
+
+ # Batched sampling params
+ temperatures: torch.Tensor = None
+ top_ps: torch.Tensor = None
+ top_ks: torch.Tensor = None
+ min_ps: torch.Tensor = None
+
+ # Dispatch in CUDA graph
+ need_min_p_sampling: bool = False
+
+ # Bias Tensors
+ logit_bias: torch.Tensor = None
+ vocab_mask: torch.Tensor = None
+
+ # Penalizer
+ penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+ linear_penalties: torch.Tensor = None
+ scaling_penalties: torch.Tensor = None
+
+ def has_bias(self):
+ return (
+ self.logit_bias is not None
+ or self.vocab_mask is not None
+ or self.linear_penalties is not None
+ or self.scaling_penalties is not None
+ )
+
+ @classmethod
+ def dummy_one(cls, max_bs: int, vocab_size: int):
+ ret = cls(vocab_size=vocab_size)
+ ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
+ ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
+ ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
+ ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
+ return ret
+
+ def __getitem__(self, key):
+ if isinstance(key, slice):
+ # NOTE: We do not use cuda graph when there is bias tensors
+ assert not self.has_bias()
+ return SamplingBatchInfo(
+ vocab_size=self.vocab_size,
+ temperatures=self.temperatures[key],
+ top_ps=self.top_ps[key],
+ top_ks=self.top_ks[key],
+ min_ps=self.min_ps[key],
+ need_min_p_sampling=self.need_min_p_sampling,
+ )
+ else:
+ raise NotImplementedError
+
+ def inplace_assign(self, bs: int, other: SamplingBatchInfo):
+ # NOTE: We do not use cuda graph when there is bias tensors
+ assert not self.has_bias()
+
+ self.vocab_size = other.vocab_size
+ self.need_min_p_sampling = other.need_min_p_sampling
+
+ self.temperatures[:bs] = other.temperatures
+ self.top_ps[:bs] = other.top_ps
+ self.top_ks[:bs] = other.top_ks
+ self.min_ps[:bs] = other.min_ps
+
+ @classmethod
+ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+ device = "cuda"
+ reqs = batch.reqs
+ ret = cls(vocab_size=vocab_size)
+
+ ret.temperatures = torch.tensor(
+ [r.sampling_params.temperature for r in reqs],
+ dtype=torch.float,
+ device=device,
+ ).view(-1, 1)
+ ret.top_ps = torch.tensor(
+ [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device
+ )
+ ret.top_ks = torch.tensor(
+ [r.sampling_params.top_k for r in reqs], dtype=torch.int, device=device
+ )
+ ret.min_ps = torch.tensor(
+ [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
+ )
+ ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)
+
+ # Each penalizers will do nothing if they evaluate themselves as not required by looking at
+ # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
+ # should not add hefty computation overhead other than simple checks.
+ #
+ # While we choose not to even create the class instances if they are not required, this
+ # could add additional complexity to the {ScheduleBatch} class, especially we need to
+ # handle {filter_batch()} and {merge()} cases as well.
+ ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
+ vocab_size=vocab_size,
+ batch=batch,
+ device=device,
+ Penalizers={
+ penaltylib.BatchedFrequencyPenalizer,
+ penaltylib.BatchedMinNewTokensPenalizer,
+ penaltylib.BatchedPresencePenalizer,
+ penaltylib.BatchedRepetitionPenalizer,
+ },
+ )
+
+ # Handle logit bias but only allocate when needed
+ ret.logit_bias = None
+
+ ret.update_regex_vocab_mask(batch)
+
+ return ret
+
+ def prepare_penalties(self):
+ self.scaling_penalties = None
+ self.linear_penalties = None
+
+ for penalizer in self.penalizer_orchestrator.penalizers.values():
+ if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
+ if penalizer.is_prepared():
+ self.scaling_penalties = penalizer.cumulated_repetition_penalties
+ else:
+ if penalizer.is_prepared():
+ if self.linear_penalties is None:
+ bs = self.penalizer_orchestrator.batch.batch_size()
+ self.linear_penalties = torch.zeros(
+ (bs, self.vocab_size),
+ dtype=torch.float32,
+ device="cuda",
+ )
+ self.linear_penalties = penalizer.apply(self.linear_penalties)
+
+ def update_regex_vocab_mask(self, batch: ScheduleBatch):
+ bs, reqs = batch.batch_size(), batch.reqs
+ device = "cuda"
+ has_regex = any(req.regex_fsm is not None for req in reqs)
+
+ # Reset the vocab mask
+ self.vocab_mask = None
+
+ if has_regex:
+ for i, req in enumerate(reqs):
+ if req.regex_fsm is not None:
+ if self.vocab_mask is None:
+ self.vocab_mask = torch.zeros(
+ bs, self.vocab_size, dtype=torch.bool, device=device
+ )
+ self.vocab_mask[i][
+ req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
+ ] = 1
+
+ def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor):
+ self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
+
+ for item in [
+ "temperatures",
+ "top_ps",
+ "top_ks",
+ "min_ps",
+ "logit_bias",
+ ]:
+ self_val = getattr(self, item, None)
+ if self_val is not None: # logit_bias can be None
+ setattr(self, item, self_val[new_indices])
+
+ def merge(self, other: "SamplingBatchInfo"):
+ self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
+
+ for item in [
+ "temperatures",
+ "top_ps",
+ "top_ks",
+ "min_ps",
+ ]:
+ self_val = getattr(self, item, None)
+ other_val = getattr(other, item, None)
+ setattr(self, item, torch.concat([self_val, other_val]))
+
+ # logit_bias can be None
+ if self.logit_bias is not None or other.logit_bias is not None:
+ vocab_size = (
+ self.logit_bias.shape[1]
+ if self.logit_bias is not None
+ else other.logit_bias.shape[1]
+ )
+ if self.logit_bias is None:
+ self.logit_bias = torch.zeros(
+ (len(self.reqs), vocab_size), dtype=torch.float32, device="cuda"
+ )
+ if other.logit_bias is None:
+ other.logit_bias = torch.zeros(
+ (len(other.reqs), vocab_size), dtype=torch.float32, device="cuda"
+ )
+ self.logit_bias = torch.concat([self.logit_bias, other.logit_bias])
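The min_ps / need_min_p_sampling fields above back the new min_p sampling parameter: only tokens whose probability is at least min_p times the probability of the most likely token are kept. A minimal PyTorch sketch of that filtering rule, for illustration only (the released sampling kernel presumably lives in sglang/srt/layers/sampler.py per the file list, not in this snippet):

import torch

def min_p_filter(probs: torch.Tensor, min_ps: torch.Tensor) -> torch.Tensor:
    # probs: (batch, vocab) softmax output; min_ps: (batch,) values in [0, 1]
    top = probs.max(dim=-1, keepdim=True).values           # per-row max probability
    keep = probs >= min_ps.unsqueeze(-1) * top              # keep tokens close enough to the top
    filtered = torch.where(keep, probs, torch.zeros_like(probs))
    return filtered / filtered.sum(dim=-1, keepdim=True)    # renormalize the survivors

probs = torch.softmax(torch.randn(2, 8), dim=-1)
print(min_p_filter(probs, torch.tensor([0.1, 0.3])))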
sglang/srt/{sampling_params.py → sampling/sampling_params.py} RENAMED
@@ -30,6 +30,7 @@ class SamplingParams:
  temperature: float = 1.0,
  top_p: float = 1.0,
  top_k: int = -1,
+ min_p: float = 0.0,
  frequency_penalty: float = 0.0,
  presence_penalty: float = 0.0,
  repetition_penalty: float = 1.0,
@@ -42,6 +43,7 @@ class SamplingParams:
  self.temperature = temperature
  self.top_p = top_p
  self.top_k = top_k
+ self.min_p = min_p
  self.frequency_penalty = frequency_penalty
  self.presence_penalty = presence_penalty
  self.repetition_penalty = repetition_penalty
@@ -69,6 +71,8 @@ class SamplingParams:
  )
  if not 0.0 < self.top_p <= 1.0:
  raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
+ if not 0.0 <= self.min_p <= 1.0:
+ raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
  if self.top_k < -1 or self.top_k == 0:
  raise ValueError(
  f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."
@@ -123,3 +127,17 @@ class SamplingParams:
  else:
  stop_str_max_len = max(stop_str_max_len, len(stop_str))
  self.stop_str_max_len = stop_str_max_len
+
+ def to_srt_kwargs(self):
+ return {
+ "max_new_tokens": self.max_new_tokens,
+ "stop": self.stop_strs,
+ "stop_token_ids": list(self.stop_token_ids),
+ "temperature": self.temperature,
+ "top_p": self.top_p,
+ "top_k": self.top_k,
+ "frequency_penalty": self.frequency_penalty,
+ "presence_penalty": self.presence_penalty,
+ "ignore_eos": self.ignore_eos,
+ "regex": self.regex,
+ }
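With the change above, min_p rides along with the other sampling parameters. A client-side sketch of passing it over HTTP; the address is an assumption (sglang's usual default launch port), not something stated in this diff:

import requests

resp = requests.post(
    "http://127.0.0.1:30000/generate",  # assumed host/port of a running launch_server
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "temperature": 0.7,
            "top_p": 0.9,
            "min_p": 0.05,        # new in 0.2.14; must lie in [0, 1]
            "max_new_tokens": 16,
        },
    },
)
print(resp.json())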
sglang/srt/server.py CHANGED
@@ -24,7 +24,6 @@ import json
  import logging
  import multiprocessing as mp
  import os
- import sys
  import threading
  import time
  from http import HTTPStatus
@@ -34,7 +33,6 @@ from typing import Dict, List, Optional, Union
  setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

  import aiohttp
- import psutil
  import requests
  import uvicorn
  import uvloop
@@ -52,7 +50,11 @@ from sglang.srt.managers.controller_single import (
  start_controller_process as start_controller_process_single,
  )
  from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
- from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
+ from sglang.srt.managers.io_struct import (
+ EmbeddingReqInput,
+ GenerateReqInput,
+ UpdateWeightReqInput,
+ )
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.openai_api.adapter import (
  load_chat_template_for_openai_api,
@@ -72,6 +74,7 @@ from sglang.srt.utils import (
  add_api_key_middleware,
  allocate_init_ports,
  assert_pkg_version,
+ configure_logger,
  enable_show_time_cost,
  kill_child_process,
  maybe_set_triton_cache_manager,
@@ -92,10 +95,25 @@ tokenizer_manager = None

  @app.get("/health")
  async def health() -> Response:
- """Health check."""
+ """Check the health of the http server."""
  return Response(status_code=200)


+ @app.get("/health_generate")
+ async def health_generate(request: Request) -> Response:
+ """Check the health of the inference server by generating one token."""
+ gri = GenerateReqInput(
+ text="s", sampling_params={"max_new_tokens": 1, "temperature": 0.7}
+ )
+ try:
+ async for _ in tokenizer_manager.generate_request(gri, request):
+ break
+ return Response(status_code=200)
+ except Exception as e:
+ logger.exception(e)
+ return Response(status_code=503)
+
+
  @app.get("/get_model_info")
  async def get_model_info():
  result = {
@@ -120,6 +138,23 @@ async def flush_cache():
  )


+ @app.post("/update_weights")
+ async def update_weights(obj: UpdateWeightReqInput, request: Request):
+
+ success, message = await tokenizer_manager.update_weights(obj, request)
+ content = {"message": message, "success": str(success)}
+ if success:
+ return JSONResponse(
+ content,
+ status_code=HTTPStatus.OK,
+ )
+ else:
+ return JSONResponse(
+ content,
+ status_code=HTTPStatus.BAD_REQUEST,
+ )
+
+
  async def generate_request(obj: GenerateReqInput, request: Request):
  """Handle a generate request."""
  if obj.stream:
@@ -236,15 +271,12 @@ def launch_server(
  """Launch an HTTP server."""
  global tokenizer_manager

- logging.basicConfig(
- level=getattr(logging, server_args.log_level.upper()),
- format="%(message)s",
- )
+ configure_logger(server_args)

  server_args.check_server_args()
  _set_envs_and_config(server_args)

- # Allocate ports
+ # Allocate ports for inter-process communications
  server_args.port, server_args.additional_ports = allocate_init_ports(
  server_args.port,
  server_args.additional_ports,
@@ -264,27 +296,29 @@ def launch_server(
  server_args.tokenizer_path = prepare_tokenizer(server_args.tokenizer_path)

  # Launch processes for multi-node tensor parallelism
- if server_args.nnodes > 1:
- if server_args.node_rank != 0:
- tp_size_local = server_args.tp_size // server_args.nnodes
- gpu_ids = [
- i for _ in range(server_args.nnodes) for i in range(tp_size_local)
- ]
- tp_rank_range = list(
- range(
- server_args.node_rank * tp_size_local,
- (server_args.node_rank + 1) * tp_size_local,
- )
- )
- procs = launch_tp_servers(
- gpu_ids,
- tp_rank_range,
- server_args,
- ports[3],
- model_overide_args,
+ if server_args.nnodes > 1 and server_args.node_rank != 0:
+ tp_size_local = server_args.tp_size // server_args.nnodes
+ gpu_ids = [i for _ in range(server_args.nnodes) for i in range(tp_size_local)]
+ tp_rank_range = list(
+ range(
+ server_args.node_rank * tp_size_local,
+ (server_args.node_rank + 1) * tp_size_local,
  )
- while True:
- pass
+ )
+ procs = launch_tp_servers(
+ gpu_ids,
+ tp_rank_range,
+ server_args,
+ ports[3],
+ model_overide_args,
+ )
+
+ try:
+ for p in procs:
+ p.join()
+ finally:
+ kill_child_process(os.getpid(), including_parent=False)
+ return

  # Launch processes
  tokenizer_manager = TokenizerManager(server_args, port_args, model_overide_args)
@@ -297,11 +331,13 @@ def launch_server(
  start_process = start_controller_process_single
  else:
  start_process = start_controller_process_multi
+
  proc_controller = mp.Process(
  target=start_process,
  args=(server_args, port_args, pipe_controller_writer, model_overide_args),
  )
  proc_controller.start()
+
  proc_detoken = mp.Process(
  target=start_detokenizer_process,
  args=(
@@ -319,15 +355,11 @@ def launch_server(
  if controller_init_state != "init ok" or detoken_init_state != "init ok":
  proc_controller.kill()
  proc_detoken.kill()
- print(
- f"Initialization failed. controller_init_state: {controller_init_state}",
- flush=True,
- )
- print(
- f"Initialization failed. detoken_init_state: {detoken_init_state}",
- flush=True,
+ raise RuntimeError(
+ "Initialization failed. "
+ f"controller_init_state: {controller_init_state}, "
+ f"detoken_init_state: {detoken_init_state}"
  )
- sys.exit(1)
  assert proc_controller.is_alive() and proc_detoken.is_alive()

  # Add api key authorization
@@ -336,12 +368,12 @@ def launch_server(

  # Send a warmup request
  t = threading.Thread(
- target=_wait_and_warmup, args=(server_args, pipe_finish_writer)
+ target=_wait_and_warmup, args=(server_args, pipe_finish_writer, os.getpid())
  )
  t.start()

- # Listen for requests
  try:
+ # Listen for requests
  uvicorn.run(
  app,
  host=server_args.host,
@@ -389,7 +421,7 @@ def _set_envs_and_config(server_args: ServerArgs):
  )


- def _wait_and_warmup(server_args, pipe_finish_writer):
+ def _wait_and_warmup(server_args, pipe_finish_writer, pid):
  headers = {}
  url = server_args.url()
  if server_args.api_key:
@@ -412,8 +444,9 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
  if not success:
  if pipe_finish_writer is not None:
  pipe_finish_writer.send(last_traceback)
- print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
- sys.exit(1)
+ logger.error(f"Initialization failed. warmup error: {last_traceback}")
+ kill_child_process(pid, including_parent=False)
+ return

  # Send a warmup request
  request_name = "/generate" if model_info["is_generation"] else "/encode"
@@ -438,21 +471,13 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
  timeout=600,
  )
  assert res.status_code == 200, f"{res}"
- except Exception as e:
+ except Exception:
  last_traceback = get_exception_traceback()
  if pipe_finish_writer is not None:
  pipe_finish_writer.send(last_traceback)
- print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
- sys.exit(1)
-
- # Print warnings here
- if server_args.disable_radix_cache and server_args.chunked_prefill_size is not None:
- logger.warning(
- "You set both `--disable-radix-cache` and `--chunked-prefill-size`. "
- "This combination is an experimental feature and we noticed it can lead to "
- "wrong generation results. If you want to use chunked prefill, it is recommended "
- "not using `--disable-radix-cache`."
- )
+ logger.error(f"Initialization failed. warmup error: {last_traceback}")
+ kill_child_process(pid, including_parent=False)
+ return

  logger.info("The server is fired up and ready to roll!")
  if pipe_finish_writer is not None:
@@ -490,6 +515,7 @@ class Runtime:

  self.pid = None
  pipe_reader, pipe_writer = mp.Pipe(duplex=False)
+
  proc = mp.Process(
  target=launch_server,
  args=(self.server_args, model_overide_args, pipe_writer),
@@ -566,15 +592,17 @@ class Runtime:

  def generate(
  self,
- prompt: str,
+ prompt: Union[str, List[str]],
  sampling_params: Optional[Dict] = None,
  return_logprob: Optional[Union[List[bool], bool]] = False,
+ logprob_start_len: Optional[Union[List[int], int]] = None,
  top_logprobs_num: Optional[Union[List[int], int]] = None,
  ):
  json_data = {
  "text": prompt,
  "sampling_params": sampling_params,
  "return_logprob": return_logprob,
+ "logprob_start_len": logprob_start_len,
  "top_logprobs_num": top_logprobs_num,
  }
  response = requests.post(
@@ -585,7 +613,7 @@ class Runtime:

  def encode(
  self,
- prompt: str,
+ prompt: Union[str, List[str]],
  ):
  json_data = {
  "text": prompt,
sglang/srt/server_args.py CHANGED
@@ -33,11 +33,13 @@ class ServerArgs:
  skip_tokenizer_init: bool = False
  load_format: str = "auto"
  dtype: str = "auto"
+ kv_cache_dtype: str = "auto"
  trust_remote_code: bool = True
  context_length: Optional[int] = None
  quantization: Optional[str] = None
  served_model_name: Optional[str] = None
  chat_template: Optional[str] = None
+ is_embedding: bool = False

  # Port
  host: str = "127.0.0.1"
@@ -79,12 +81,14 @@ class ServerArgs:
  disable_radix_cache: bool = False
  disable_regex_jump_forward: bool = False
  disable_cuda_graph: bool = False
+ disable_cuda_graph_padding: bool = False
  disable_disk_cache: bool = False
+ disable_custom_all_reduce: bool = False
+ enable_mixed_chunk: bool = False
  enable_torch_compile: bool = False
  enable_p2p_check: bool = False
  enable_mla: bool = False
- attention_reduce_in_fp32: bool = False
- efficient_weight_load: bool = False
+ triton_attention_reduce_in_fp32: bool = False

  # Distributed args
  nccl_init_addr: Optional[str] = None
@@ -193,11 +197,23 @@ class ServerArgs:
  '* "float" is shorthand for FP32 precision.\n'
  '* "float32" for FP32 precision.',
  )
+ parser.add_argument(
+ "--kv-cache-dtype",
+ type=str,
+ default=ServerArgs.kv_cache_dtype,
+ choices=["auto", "fp8_e5m2"],
+ help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.',
+ )
  parser.add_argument(
  "--trust-remote-code",
  action="store_true",
  help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
  )
+ parser.add_argument(
+ "--is-embedding",
+ action="store_true",
+ help="Whether to use a CausalLM as an embedding model.",
+ )
  parser.add_argument(
  "--context-length",
  type=int,
@@ -391,11 +407,27 @@ class ServerArgs:
  action="store_true",
  help="Disable cuda graph.",
  )
+ parser.add_argument(
+ "--disable-cuda-graph-padding",
+ action="store_true",
+ help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
+ )
  parser.add_argument(
  "--disable-disk-cache",
  action="store_true",
  help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
  )
+ parser.add_argument(
+ "--disable-custom-all-reduce",
+ action="store_true",
+ default=False,
+ help="Disable the custom all-reduce kernel and fall back to NCCL.",
+ )
+ parser.add_argument(
+ "--enable-mixed-chunk",
+ action="store_true",
+ help="Enabling mixing prefill and decode in a batch when using chunked prefill.",
+ )
  parser.add_argument(
  "--enable-torch-compile",
  action="store_true",
@@ -409,13 +441,13 @@ class ServerArgs:
  parser.add_argument(
  "--enable-mla",
  action="store_true",
- help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2",
+ help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
  )
  parser.add_argument(
- "--attention-reduce-in-fp32",
+ "--triton-attention-reduce-in-fp32",
  action="store_true",
  help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
- "This only affects Triton attention kernels",
+ "This only affects Triton attention kernels.",
  )
  parser.add_argument(
  "--efficient-weight-load",
@@ -433,15 +465,6 @@ class ServerArgs:
  def url(self):
  return f"http://{self.host}:{self.port}"

- def print_mode_args(self):
- return (
- f"disable_flashinfer={self.disable_flashinfer}, "
- f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
- f"disable_radix_cache={self.disable_radix_cache}, "
- f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
- f"disable_disk_cache={self.disable_disk_cache}, "
- )
-
  def check_server_args(self):
  assert (
  self.tp_size % self.nnodes == 0
@@ -449,8 +472,13 @@ class ServerArgs:
  assert not (
  self.dp_size > 1 and self.node_rank is not None
  ), "multi-node data parallel is not supported"
+ if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
+ logger.info(
+ "Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True"
+ )
+ self.trust_remote_code = False
  if "gemma-2" in self.model_path.lower():
- logger.info(f"When using sliding window in gemma-2, turn on flashinfer.")
+ logger.info("When using sliding window in gemma-2, turn on flashinfer.")
  self.disable_flashinfer = False

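The new and renamed flags above all map onto ServerArgs fields. A minimal sketch of setting them programmatically, assuming sglang.Runtime forwards its keyword arguments to ServerArgs (the model name is a placeholder, not taken from this diff):

import sglang as sgl

runtime = sgl.Runtime(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
    kv_cache_dtype="fp8_e5m2",             # new: FP8 E5M2 KV cache (CUDA 11.8+)
    enable_mixed_chunk=True,               # new: mix prefill and decode in one batch
    disable_custom_all_reduce=True,        # new: fall back to NCCL for all-reduce
    triton_attention_reduce_in_fp32=True,  # renamed from attention_reduce_in_fp32
)
print(runtime.generate("Hello", sampling_params={"max_new_tokens": 8, "min_p": 0.1}))
runtime.shutdown()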