sglang 0.3.4.post2__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_latency.py +3 -3
- sglang/bench_server_latency.py +2 -3
- sglang/bench_serving.py +92 -0
- sglang/global_config.py +9 -3
- sglang/lang/chat_template.py +50 -25
- sglang/lang/interpreter.py +9 -1
- sglang/lang/ir.py +11 -2
- sglang/launch_server.py +1 -1
- sglang/srt/configs/model_config.py +51 -13
- sglang/srt/constrained/__init__.py +18 -0
- sglang/srt/constrained/bnf_cache.py +61 -0
- sglang/srt/constrained/grammar.py +190 -0
- sglang/srt/hf_transformers_utils.py +6 -5
- sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
- sglang/srt/layers/fused_moe/fused_moe.py +4 -3
- sglang/srt/layers/fused_moe/layer.py +28 -0
- sglang/srt/layers/quantization/base_config.py +16 -1
- sglang/srt/layers/vocab_parallel_embedding.py +486 -0
- sglang/srt/managers/data_parallel_controller.py +7 -6
- sglang/srt/managers/detokenizer_manager.py +9 -11
- sglang/srt/managers/image_processor.py +4 -3
- sglang/srt/managers/io_struct.py +70 -78
- sglang/srt/managers/schedule_batch.py +33 -49
- sglang/srt/managers/schedule_policy.py +24 -13
- sglang/srt/managers/scheduler.py +137 -80
- sglang/srt/managers/tokenizer_manager.py +224 -336
- sglang/srt/managers/tp_worker.py +5 -5
- sglang/srt/mem_cache/flush_cache.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +7 -4
- sglang/srt/model_executor/model_runner.py +8 -17
- sglang/srt/models/baichuan.py +4 -4
- sglang/srt/models/chatglm.py +4 -4
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +5 -5
- sglang/srt/models/deepseek.py +4 -4
- sglang/srt/models/deepseek_v2.py +4 -4
- sglang/srt/models/exaone.py +4 -4
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt2.py +287 -0
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +4 -4
- sglang/srt/models/internlm2.py +4 -4
- sglang/srt/models/llama.py +15 -7
- sglang/srt/models/llama_embedding.py +2 -10
- sglang/srt/models/llama_reward.py +5 -0
- sglang/srt/models/minicpm.py +4 -4
- sglang/srt/models/minicpm3.py +4 -4
- sglang/srt/models/mixtral.py +7 -5
- sglang/srt/models/mixtral_quant.py +4 -4
- sglang/srt/models/mllama.py +5 -5
- sglang/srt/models/olmo.py +4 -4
- sglang/srt/models/olmoe.py +4 -4
- sglang/srt/models/qwen.py +4 -4
- sglang/srt/models/qwen2.py +4 -4
- sglang/srt/models/qwen2_moe.py +4 -4
- sglang/srt/models/qwen2_vl.py +4 -8
- sglang/srt/models/stablelm.py +4 -4
- sglang/srt/models/torch_native_llama.py +4 -4
- sglang/srt/models/xverse.py +4 -4
- sglang/srt/models/xverse_moe.py +4 -4
- sglang/srt/openai_api/adapter.py +52 -66
- sglang/srt/sampling/sampling_batch_info.py +7 -13
- sglang/srt/server.py +31 -35
- sglang/srt/server_args.py +34 -5
- sglang/srt/utils.py +40 -56
- sglang/test/runners.py +2 -1
- sglang/test/test_utils.py +73 -25
- sglang/utils.py +62 -1
- sglang/version.py +1 -1
- sglang-0.3.5.dist-info/METADATA +344 -0
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/RECORD +77 -73
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
- sglang-0.3.4.post2.dist-info/METADATA +0 -899
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.post2.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
@@ -71,6 +71,7 @@ from sglang.srt.openai_api.protocol import (
     TopLogprob,
     UsageInfo,
 )
+from sglang.utils import get_exception_traceback
 
 logger = logging.getLogger(__name__)
 
@@ -314,6 +315,8 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
         )
 
     except Exception as e:
+        logger.error(f"error: {get_exception_traceback()}")
+        responses = []
         error_json = {
             "id": f"batch_req_{uuid.uuid4()}",
             "custom_id": request_data.get("custom_id"),
@@ -363,7 +366,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
         }
 
     except Exception as e:
-        logger.error("error
+        logger.error(f"error: {e}")
         # Update batch status to "failed"
         retrieve_batch = batch_storage[batch_id]
         retrieve_batch.status = "failed"
@@ -469,80 +472,67 @@ async def v1_retrieve_file_content(file_id: str):
 def v1_generate_request(
     all_requests: List[CompletionRequest], request_ids: List[str] = None
 ):
+    if len(all_requests) > 1:
+        first_prompt_type = type(all_requests[0].prompt)
+        for request in all_requests:
+            assert (
+                type(request.prompt) is first_prompt_type
+            ), "All prompts must be of the same type in file input settings"
+            if request.n > 1:
+                raise ValueError(
+                    "Parallel sampling is not supported for completions from files"
+                )
+
     prompts = []
     sampling_params_list = []
     return_logprobs = []
     logprob_start_lens = []
     top_logprobs_nums = []
 
-    # NOTE: with openai API, the prompt's logprobs are always not computed
-    first_prompt_type = type(all_requests[0].prompt)
     for request in all_requests:
-
-            type(request.prompt) is first_prompt_type
-        ), "All prompts must be of the same type in file input settings"
-        if len(all_requests) > 1 and request.n > 1:
-            raise ValueError(
-                "Parallel sampling is not supported for completions from files"
-            )
+        # NOTE: with openai API, the prompt's logprobs are always not computed
         if request.echo and request.logprobs:
            logger.warning(
                 "Echo is not compatible with logprobs. "
-                "To compute logprobs of input prompt, please use
+                "To compute logprobs of input prompt, please use the native /generate API."
             )
 
-    for request in all_requests:
         prompts.append(request.prompt)
+        sampling_params_list.append(
+            {
+                "temperature": request.temperature,
+                "max_new_tokens": request.max_tokens,
+                "min_new_tokens": request.min_tokens,
+                "stop": request.stop,
+                "stop_token_ids": request.stop_token_ids,
+                "top_p": request.top_p,
+                "presence_penalty": request.presence_penalty,
+                "frequency_penalty": request.frequency_penalty,
+                "repetition_penalty": request.repetition_penalty,
+                "regex": request.regex,
+                "json_schema": request.json_schema,
+                "n": request.n,
+                "ignore_eos": request.ignore_eos,
+                "no_stop_trim": request.no_stop_trim,
+            }
+        )
         return_logprobs.append(request.logprobs is not None and request.logprobs > 0)
         logprob_start_lens.append(-1)
         top_logprobs_nums.append(
             request.logprobs if request.logprobs is not None else 0
         )
-        sampling_params = []
-        if isinstance(request.no_stop_trim, list):
-            num_reqs = len(request.prompt)
-        else:
-            num_reqs = 1
-        for i in range(num_reqs):
-            sampling_params.append(
-                {
-                    "temperature": request.temperature,
-                    "max_new_tokens": request.max_tokens,
-                    "min_new_tokens": request.min_tokens,
-                    "stop": request.stop,
-                    "stop_token_ids": request.stop_token_ids,
-                    "top_p": request.top_p,
-                    "presence_penalty": request.presence_penalty,
-                    "frequency_penalty": request.frequency_penalty,
-                    "repetition_penalty": request.repetition_penalty,
-                    "regex": request.regex,
-                    "json_schema": request.json_schema,
-                    "n": request.n,
-                    "ignore_eos": request.ignore_eos,
-                    "no_stop_trim": (
-                        request.no_stop_trim
-                        if not isinstance(request.no_stop_trim, list)
-                        else request.no_stop_trim[i]
-                    ),
-                }
-            )
-        if num_reqs == 1:
-            sampling_params_list.append(sampling_params[0])
-        else:
-            sampling_params_list.append(sampling_params)
 
     if len(all_requests) == 1:
-
+        if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
+            prompt_kwargs = {"text": prompts[0]}
+        else:
+            prompt_kwargs = {"input_ids": prompts[0]}
         sampling_params_list = sampling_params_list[0]
-        logprob_start_lens = logprob_start_lens[0]
         return_logprobs = return_logprobs[0]
+        logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
-        if isinstance(prompt, str) or isinstance(prompt[0], str):
-            prompt_kwargs = {"text": prompt}
-        else:
-            prompt_kwargs = {"input_ids": prompt}
     else:
-        if isinstance(prompts[0], str):
+        if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
             prompt_kwargs = {"text": prompts}
         else:
             prompt_kwargs = {"input_ids": prompts}
@@ -558,9 +548,7 @@ def v1_generate_request(
         rid=request_ids,
     )
 
-    if len(all_requests)
-        return adapted_request, all_requests[0]
-    return adapted_request, all_requests
+    return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
 
 
 def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
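
The reworked `v1_generate_request` above validates batched (file-style) inputs up front: every prompt in the batch must share one type (all strings or all token-id lists) and `n > 1` is rejected for completions from files, while the per-request sampling parameters are now built once per request. A minimal client-side sketch of a batched call that satisfies these checks; the host, port, and model name are illustrative assumptions:

```python
import requests

# Illustrative endpoint; adjust host/port/model to your deployment.
BASE_URL = "http://127.0.0.1:30000"

# All prompts share one type (plain strings here) and n stays at 1,
# which satisfies the checks added in v1_generate_request.
payload = {
    "model": "default",
    "prompt": [
        "The capital of France is",
        "The capital of Japan is",
    ],
    "max_tokens": 16,
    "temperature": 0,
}

resp = requests.post(f"{BASE_URL}/v1/completions", json=payload)
resp.raise_for_status()
for choice in resp.json()["choices"]:
    print(choice["index"], choice["text"])
```
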
@@ -595,7 +583,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
         if isinstance(request, list) and request[idx].echo:
             echo = True
             text = request[idx].prompt + text
-        if
+        if echo and not isinstance(request, list):
             prompt_index = idx // request.n
             text = prompts[prompt_index] + text
 
@@ -709,7 +697,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
         async for content in tokenizer_manager.generate_request(
             adapted_request, raw_request
         ):
-            index = content
+            index = content.get("index", 0)
 
             stream_buffer = stream_buffers.get(index, "")
             n_prev_token = n_prev_tokens.get(index, 0)
@@ -945,19 +933,18 @@ def v1_chat_generate_request(
         sampling_params_list.append(sampling_params)
 
         image_data_list.append(image_data)
-        modalities_list.
+        modalities_list.append(modalities)
     if len(all_requests) == 1:
-
-
-            prompt_kwargs = {"text": input_ids}
+        if isinstance(input_ids[0], str):
+            prompt_kwargs = {"text": input_ids[0]}
         else:
-            prompt_kwargs = {"input_ids": input_ids}
+            prompt_kwargs = {"input_ids": input_ids[0]}
         sampling_params_list = sampling_params_list[0]
         image_data_list = image_data_list[0]
         return_logprobs = return_logprobs[0]
         logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
-        modalities_list = modalities_list[
+        modalities_list = modalities_list[0]
     else:
         if isinstance(input_ids[0], str):
             prompt_kwargs = {"text": input_ids}
@@ -976,9 +963,8 @@ def v1_chat_generate_request(
         rid=request_ids,
         modalities=modalities_list,
     )
-
-
-    return adapted_request, all_requests
+
+    return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
 
 
 def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
@@ -1116,7 +1102,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
         async for content in tokenizer_manager.generate_request(
             adapted_request, raw_request
         ):
-            index = content
+            index = content.get("index", 0)
 
             is_first = is_firsts.get(index, True)
             stream_buffer = stream_buffers.get(index, "")
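
Both streaming paths now read the choice index defensively (`content.get("index", 0)`) and keep one buffer per index, so parallel-sampling streams can interleave chunks from different choices. A sketch of regrouping such a stream on the client side with the `openai` package; the base URL, API key, and model name are assumptions for a locally launched server:

```python
from collections import defaultdict

from openai import OpenAI

# Assumed local OpenAI-compatible endpoint exposed by the sglang server.
client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Name three prime numbers."}],
    n=2,          # parallel sampling: chunks carry different choice indices
    stream=True,
)

buffers = defaultdict(str)
for chunk in stream:
    for choice in chunk.choices:
        # Mirror the server-side fallback to index 0 when no index is given.
        idx = choice.index if choice.index is not None else 0
        buffers[idx] += choice.delta.content or ""

for idx, text in sorted(buffers.items()):
    print(f"[choice {idx}] {text}")
```
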
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List, Optional
 import torch
 
 import sglang.srt.sampling.penaltylib as penaltylib
-from sglang.srt.constrained import
+from sglang.srt.constrained.grammar import Grammar
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -29,11 +29,9 @@ class SamplingBatchInfo:
     # Bias Tensors
     vocab_size: int
     logit_bias: torch.Tensor = None
-    vocab_mask: torch.Tensor = None
+    vocab_mask: Optional[torch.Tensor] = None
 
-
-    regex_fsms: List[RegexGuide] = None
-    regex_fsm_states: List[int] = None
+    grammars: Optional[List[Optional[Grammar]]] = None
 
     # Penalizer
     penalizer_orchestrator: Optional[penaltylib.BatchedPenalizerOrchestrator] = None
@@ -136,8 +134,7 @@ class SamplingBatchInfo:
             self.linear_penalties = penalizer.apply(self.linear_penalties)
 
     def update_regex_vocab_mask(self):
-
-        if not has_regex:
+        if not self.grammars or not any(grammar for grammar in self.grammars):
            self.vocab_mask = None
            return
 
@@ -147,12 +144,9 @@ class SamplingBatchInfo:
            dtype=torch.bool,
            device=self.device,
        )
-        for i,
-            if
-                self.vocab_mask[i].
-                self.vocab_mask[i][
-                    regex_fsm.get_next_instruction(self.regex_fsm_states[i]).tokens
-                ] = 0
+        for i, grammar in enumerate(self.grammars):
+            if grammar is not None:
+                grammar.fill_vocab_mask(self.vocab_mask[i], self.vocab_size)
 
     def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
         if self.penalizer_orchestrator:
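
`SamplingBatchInfo` now carries one optional `Grammar` per request (the backend-agnostic wrapper added in `sglang/srt/constrained/grammar.py`) and delegates mask construction to `grammar.fill_vocab_mask`. The sketch below illustrates, with plain PyTorch and a toy stand-in for the mask-filling call, how such a vocab mask is applied to logits before sampling; it shows the general technique, not sglang's exact code:

```python
import torch

vocab_size, batch_size = 8, 2

# True marks a token as disallowed, mirroring a mask initialized to all-ones
# that the grammar then clears for the tokens it allows.
vocab_mask = torch.ones(batch_size, vocab_size, dtype=torch.bool)


def toy_fill_vocab_mask(row_mask: torch.Tensor, allowed_tokens) -> None:
    """Toy stand-in for Grammar.fill_vocab_mask: clear the mask for allowed tokens."""
    row_mask[list(allowed_tokens)] = False


toy_fill_vocab_mask(vocab_mask[0], {1, 3})  # request 0 may only emit tokens 1 and 3
vocab_mask[1] = False                       # request 1 is unconstrained

logits = torch.randn(batch_size, vocab_size)
logits = logits.masked_fill(vocab_mask, float("-inf"))  # masked tokens can never be sampled

probs = torch.softmax(logits, dim=-1)
print(probs[0])  # non-zero probability only at indices 1 and 3
```
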
sglang/srt/server.py
CHANGED
@@ -53,7 +53,6 @@ from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
 from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     GenerateReqInput,
-    RewardReqInput,
     UpdateWeightReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
@@ -91,7 +90,7 @@ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
 
 app = FastAPI()
-tokenizer_manager = None
+tokenizer_manager: TokenizerManager = None
 
 app.add_middleware(
     CORSMiddleware,
@@ -139,7 +138,7 @@ async def get_server_args():
     return dataclasses.asdict(tokenizer_manager.server_args)
 
 
-@app.
+@app.post("/flush_cache")
 async def flush_cache():
     """Flush the radix cache."""
     tokenizer_manager.flush_cache()
@@ -177,9 +176,10 @@ async def get_memory_pool_size():
     """Get the memory pool size in number of tokens"""
     try:
         ret = await tokenizer_manager.get_memory_pool_size()
-
+
+        return ret
     except Exception as e:
-        return
+        return ORJSONResponse(
             {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
         )
 
@@ -253,8 +253,8 @@ app.post("/encode")(encode_request)
 app.put("/encode")(encode_request)
 
 
-async def judge_request(obj:
-    """Handle a reward model request."""
+async def judge_request(obj: EmbeddingReqInput, request: Request):
+    """Handle a reward model request. Now the arguments and return values are the same as embedding models."""
     try:
         ret = await tokenizer_manager.generate_request(obj, request).__anext__()
         return ret
@@ -441,7 +441,7 @@ def launch_server(
 
     # Send a warmup request
     t = threading.Thread(
-        target=_wait_and_warmup, args=(server_args, pipe_finish_writer
+        target=_wait_and_warmup, args=(server_args, pipe_finish_writer)
     )
     t.start()
 
@@ -496,7 +496,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     mp.set_start_method("spawn", force=True)
 
 
-def _wait_and_warmup(server_args, pipe_finish_writer, pid):
+def _wait_and_warmup(server_args, pipe_finish_writer):
     headers = {}
     url = server_args.url()
     if server_args.api_key:
@@ -519,7 +519,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer, pid):
         if pipe_finish_writer is not None:
             pipe_finish_writer.send(last_traceback)
         logger.error(f"Initialization failed. warmup error: {last_traceback}")
-        kill_child_process(
+        kill_child_process(include_self=True)
         return
 
     model_info = res.json()
@@ -551,7 +551,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer, pid):
         if pipe_finish_writer is not None:
             pipe_finish_writer.send(last_traceback)
         logger.error(f"Initialization failed. warmup error: {last_traceback}")
-        kill_child_process(
+        kill_child_process(include_self=True)
         return
 
     # logger.info(f"{res.json()=}")
@@ -617,7 +617,7 @@ class Runtime:
 
     def shutdown(self):
         if self.pid is not None:
-            kill_child_process(self.pid)
+            kill_child_process(self.pid, include_self=True)
             self.pid = None
 
     def cache_prefix(self, prefix: str):
@@ -696,24 +696,8 @@ class Runtime:
         self,
         prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
     ):
-
-
-            json_data = {
-                "text": prompt,
-            }
-            response = requests.post(
-                self.url + "/encode",
-                json=json_data,
-            )
-        else:
-            # reward
-            json_data = {
-                "conv": prompt,
-            }
-            response = requests.post(
-                self.url + "/judge",
-                json=json_data,
-            )
+        json_data = {"text": prompt}
+        response = requests.post(self.url + "/encode", json=json_data)
         return json.dumps(response.json())
 
     def __del__(self):
@@ -736,24 +720,32 @@ class Engine:
 
         # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
         atexit.register(self.shutdown)
+
+        # runtime server default log level is log
+        # offline engine works in scripts, so we set it to error
+
+        if 'log_level' not in kwargs:
+            kwargs['log_level'] = 'error'
 
         server_args = ServerArgs(*args, **kwargs)
         launch_engine(server_args=server_args)
 
     def generate(
         self,
-        prompt
+        # The input prompt. It can be a single prompt or a batch of prompts.
+        prompt: Optional[Union[List[str], str]] = None,
         sampling_params: Optional[Dict] = None,
+        # The token ids for text; one can either specify text or input_ids.
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
         return_logprob: Optional[Union[List[bool], bool]] = False,
         logprob_start_len: Optional[Union[List[int], int]] = None,
        top_logprobs_num: Optional[Union[List[int], int]] = None,
        lora_path: Optional[List[Optional[str]]] = None,
        stream: bool = False,
     ):
-        # TODO (ByronHsu): refactor to reduce the duplicated code
-
         obj = GenerateReqInput(
             text=prompt,
+            input_ids=input_ids,
             sampling_params=sampling_params,
             return_logprob=return_logprob,
             logprob_start_len=logprob_start_len,
@@ -791,8 +783,11 @@ class Engine:
 
     async def async_generate(
         self,
-        prompt
+        # The input prompt. It can be a single prompt or a batch of prompts.
+        prompt: Optional[Union[List[str], str]] = None,
         sampling_params: Optional[Dict] = None,
+        # The token ids for text; one can either specify text or input_ids.
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
         return_logprob: Optional[Union[List[bool], bool]] = False,
         logprob_start_len: Optional[Union[List[int], int]] = None,
         top_logprobs_num: Optional[Union[List[int], int]] = None,
@@ -801,6 +796,7 @@ class Engine:
     ):
         obj = GenerateReqInput(
             text=prompt,
+            input_ids=input_ids,
             sampling_params=sampling_params,
             return_logprob=return_logprob,
             logprob_start_len=logprob_start_len,
@@ -834,7 +830,7 @@ class Engine:
         return ret
 
     def shutdown(self):
-        kill_child_process(
+        kill_child_process()
 
     def get_tokenizer(self):
         global tokenizer_manager
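
The offline `Engine` now defaults `log_level` to `"error"` when the caller does not set it, and `generate`/`async_generate` accept pre-tokenized input through the new `input_ids` argument. A usage sketch based on the signatures above; the top-level `sglang.Engine` import and the model path are assumptions (the class itself lives in `sglang/srt/server.py`):

```python
import sglang as sgl  # assumes Engine is exported at the package top level

# Offline engine: log_level falls back to "error" unless overridden.
engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model

# Text prompts...
out = engine.generate(
    prompt=["The capital of France is", "The capital of Japan is"],
    sampling_params={"temperature": 0, "max_new_tokens": 16},
)
print(out)

# ...or pre-tokenized input via the new input_ids argument (ids are illustrative).
out = engine.generate(
    input_ids=[[128000, 791, 6864, 315, 9822, 374]],
    sampling_params={"temperature": 0, "max_new_tokens": 16},
)
print(out)

engine.shutdown()
```
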
sglang/srt/server_args.py
CHANGED
@@ -63,6 +63,7 @@ class ServerArgs:
     stream_interval: int = 1
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
+    decode_log_interval: int = 40
 
     # Logging
     log_level: str = "info"
@@ -74,6 +75,7 @@ class ServerArgs:
     api_key: Optional[str] = None
     file_storage_pth: str = "SGLang_storage"
     enable_cache_report: bool = False
+    watchdog_timeout: float = 600
 
     # Data parallelism
     dp_size: int = 1
@@ -102,6 +104,7 @@ class ServerArgs:
     # Kernel backend
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
+    grammar_backend: Optional[str] = "outlines"
 
     # Optimization/debug options
     disable_flashinfer: bool = False
@@ -118,7 +121,8 @@ class ServerArgs:
     enable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
-
+    torch_compile_max_bs: int = 32
+    cuda_graph_max_bs: int = 160
     torchao_config: str = ""
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
@@ -427,6 +431,18 @@ class ServerArgs:
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
         )
+        parser.add_argument(
+            "--watchdog-timeout",
+            type=float,
+            default=ServerArgs.watchdog_timeout,
+            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
+        )
+        parser.add_argument(
+            "--decode-log-interval",
+            type=int,
+            default=ServerArgs.decode_log_interval,
+            help="The log interval of decode batch"
+        )
 
         # Data parallelism
         parser.add_argument(
@@ -537,6 +553,13 @@ class ServerArgs:
             default=ServerArgs.sampling_backend,
             help="Choose the kernels for sampling layers.",
         )
+        parser.add_argument(
+            "--grammar-backend",
+            type=str,
+            choices=["xgrammar", "outlines"],
+            default=ServerArgs.grammar_backend,
+            help="Choose the backend for constrained decoding.",
+        )
 
         # Optimization/debug options
         parser.add_argument(
@@ -611,11 +634,17 @@ class ServerArgs:
             help="Optimize the model with torch.compile. Experimental feature.",
         )
         parser.add_argument(
-            "--
+            "--torch-compile-max-bs",
             type=int,
-            default=ServerArgs.
+            default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph.",
+        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -712,11 +741,11 @@ class PortArgs:
 
     @staticmethod
     def init_new(server_args) -> "PortArgs":
-        port = server_args.port +
+        port = server_args.port + 42
         while True:
             if is_port_available(port):
                 break
-            port +=
+            port += 42
 
         return PortArgs(
             tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
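
The new options (`--grammar-backend`, `--watchdog-timeout`, `--decode-log-interval`, `--torch-compile-max-bs`, `--cuda-graph-max-bs`) map one-to-one onto `ServerArgs` fields, so they can be set on the command line or programmatically. A sketch with a placeholder model path:

```python
from sglang.srt.server_args import ServerArgs

# Roughly equivalent to:
#   python -m sglang.launch_server --model-path <model> \
#       --grammar-backend xgrammar --watchdog-timeout 300 \
#       --decode-log-interval 20 --cuda-graph-max-bs 64
args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    grammar_backend="xgrammar",  # or "outlines" (the default)
    watchdog_timeout=300,        # crash instead of hanging on a stuck forward batch
    decode_log_interval=20,      # log decode-batch stats every 20 steps
    torch_compile_max_bs=32,
    cuda_graph_max_bs=64,
)
print(args.grammar_backend, args.watchdog_timeout)
```
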
sglang/srt/utils.py
CHANGED
@@ -35,6 +35,7 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
+import zmq
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from torch import nn
@@ -203,56 +204,6 @@ def is_port_available(port):
     return False
 
 
-def is_multimodal_model(model_architectures):
-    if (
-        "LlavaLlamaForCausalLM" in model_architectures
-        or "LlavaQwenForCausalLM" in model_architectures
-        or "LlavaMistralForCausalLM" in model_architectures
-        or "LlavaVidForCausalLM" in model_architectures
-        or "MllamaForConditionalGeneration" in model_architectures
-        or "Qwen2VLForConditionalGeneration" in model_architectures
-    ):
-        return True
-    else:
-        return False
-
-
-def is_attention_free_model(model_architectures):
-    return False
-
-
-def model_has_inner_state(model_architectures):
-    return False
-
-
-def is_embedding_model(model_architectures):
-    if (
-        "LlamaEmbeddingModel" in model_architectures
-        or "MistralModel" in model_architectures
-        or "LlamaForSequenceClassification" in model_architectures
-        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
-    ):
-        return True
-    else:
-        return False
-
-
-def is_generation_model(model_architectures, is_embedding: bool = False):
-    # We have two ways to determine whether a model is a generative model.
-    # 1. Check the model architectue
-    # 2. check the `is_embedding` server args
-
-    if (
-        "LlamaEmbeddingModel" in model_architectures
-        or "MistralModel" in model_architectures
-        or "LlamaForSequenceClassification" in model_architectures
-        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
-    ):
-        return False
-    else:
-        return not is_embedding
-
-
 def decode_video_base64(video_base64):
     from PIL import Image
 
@@ -397,17 +348,26 @@ def kill_parent_process():
     """Kill the parent process and all children of the parent process."""
     current_process = psutil.Process()
     parent_process = current_process.parent()
-    kill_child_process(
+    kill_child_process(
+        parent_process.pid, include_self=True, skip_pid=current_process.pid
+    )
+    try:
+        current_process.kill()
+    except psutil.NoSuchProcess:
+        pass
 
 
-def kill_child_process(pid, including_parent=True, skip_pid=None):
+def kill_child_process(pid=None, include_self=False, skip_pid=None):
     """Kill the process and all its children process."""
+    if pid is None:
+        pid = os.getpid()
+
     try:
-
+        itself = psutil.Process(pid)
     except psutil.NoSuchProcess:
         return
 
-    children =
+    children = itself.children(recursive=True)
     for child in children:
         if child.pid == skip_pid:
             continue
@@ -416,9 +376,9 @@ def kill_child_process(pid, including_parent=True, skip_pid=None):
         except psutil.NoSuchProcess:
             pass
 
-    if
+    if include_self:
         try:
-
+            itself.kill()
         except psutil.NoSuchProcess:
            pass
 
@@ -720,3 +680,27 @@ def first_rank_print(*args, **kwargs):
         print(*args, **kwargs)
     else:
         pass
+
+
+def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint: str):
+    mem = psutil.virtual_memory()
+    total_mem = mem.total / 1024**3
+    available_mem = mem.available / 1024**3
+    if total_mem > 32 and available_mem > 16:
+        buf_size = int(0.5 * 1024**3)
+    else:
+        buf_size = -1
+
+    socket = context.socket(socket_type)
+    if socket_type == zmq.PUSH:
+        socket.setsockopt(zmq.SNDHWM, 0)
+        socket.setsockopt(zmq.SNDBUF, buf_size)
+        socket.connect(f"ipc://{endpoint}")
+    elif socket_type == zmq.PULL:
+        socket.setsockopt(zmq.RCVHWM, 0)
+        socket.setsockopt(zmq.RCVBUF, buf_size)
+        socket.bind(f"ipc://{endpoint}")
+    else:
+        raise ValueError(f"Unsupported socket type: {socket_type}")
+
+    return socket
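
The new `get_zmq_socket` helper centralizes the ZeroMQ setup used by the manager processes: PUSH sockets connect and PULL sockets bind on an IPC endpoint, with the high-water mark disabled and a 0.5 GB socket buffer on hosts with enough free memory. A small self-contained sketch of a PULL/PUSH pair over a temporary IPC path; the endpoint name and payload are arbitrary:

```python
import tempfile

import zmq

from sglang.srt.utils import get_zmq_socket

ctx = zmq.Context(io_threads=1)
endpoint = tempfile.NamedTemporaryFile(delete=False).name  # arbitrary IPC path

receiver = get_zmq_socket(ctx, zmq.PULL, endpoint)  # PULL binds
sender = get_zmq_socket(ctx, zmq.PUSH, endpoint)    # PUSH connects

sender.send_pyobj({"rid": "req-0", "text": "hello"})
print(receiver.recv_pyobj())

sender.close()
receiver.close()
ctx.term()
```
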