sglang 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. sglang/__init__.py +8 -0
  2. sglang/api.py +10 -2
  3. sglang/bench_latency.py +151 -40
  4. sglang/bench_serving.py +46 -22
  5. sglang/check_env.py +24 -2
  6. sglang/global_config.py +0 -1
  7. sglang/lang/backend/base_backend.py +3 -1
  8. sglang/lang/backend/openai.py +8 -3
  9. sglang/lang/backend/runtime_endpoint.py +46 -29
  10. sglang/lang/choices.py +164 -0
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +6 -13
  13. sglang/lang/ir.py +14 -5
  14. sglang/srt/constrained/base_tool_cache.py +1 -1
  15. sglang/srt/constrained/fsm_cache.py +12 -2
  16. sglang/srt/layers/activation.py +33 -0
  17. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  18. sglang/srt/layers/extend_attention.py +6 -1
  19. sglang/srt/layers/layernorm.py +65 -0
  20. sglang/srt/layers/logits_processor.py +6 -1
  21. sglang/srt/layers/pooler.py +50 -0
  22. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  23. sglang/srt/layers/radix_attention.py +4 -7
  24. sglang/srt/managers/detokenizer_manager.py +31 -9
  25. sglang/srt/managers/io_struct.py +63 -0
  26. sglang/srt/managers/policy_scheduler.py +173 -25
  27. sglang/srt/managers/schedule_batch.py +174 -380
  28. sglang/srt/managers/tokenizer_manager.py +197 -112
  29. sglang/srt/managers/tp_worker.py +299 -364
  30. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  31. sglang/srt/mem_cache/chunk_cache.py +43 -20
  32. sglang/srt/mem_cache/memory_pool.py +10 -15
  33. sglang/srt/mem_cache/radix_cache.py +74 -40
  34. sglang/srt/model_executor/cuda_graph_runner.py +27 -12
  35. sglang/srt/model_executor/forward_batch_info.py +319 -0
  36. sglang/srt/model_executor/model_runner.py +30 -47
  37. sglang/srt/models/chatglm.py +1 -1
  38. sglang/srt/models/commandr.py +1 -1
  39. sglang/srt/models/dbrx.py +1 -1
  40. sglang/srt/models/deepseek.py +1 -1
  41. sglang/srt/models/deepseek_v2.py +1 -1
  42. sglang/srt/models/gemma.py +1 -1
  43. sglang/srt/models/gemma2.py +1 -2
  44. sglang/srt/models/gpt_bigcode.py +1 -1
  45. sglang/srt/models/grok.py +1 -1
  46. sglang/srt/models/internlm2.py +3 -8
  47. sglang/srt/models/llama2.py +5 -5
  48. sglang/srt/models/llama_classification.py +1 -1
  49. sglang/srt/models/llama_embedding.py +88 -0
  50. sglang/srt/models/llava.py +1 -2
  51. sglang/srt/models/llavavid.py +1 -2
  52. sglang/srt/models/minicpm.py +1 -1
  53. sglang/srt/models/mixtral.py +1 -1
  54. sglang/srt/models/mixtral_quant.py +1 -1
  55. sglang/srt/models/qwen.py +1 -1
  56. sglang/srt/models/qwen2.py +1 -1
  57. sglang/srt/models/qwen2_moe.py +1 -12
  58. sglang/srt/models/stablelm.py +1 -1
  59. sglang/srt/openai_api/adapter.py +189 -39
  60. sglang/srt/openai_api/protocol.py +43 -1
  61. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  62. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  63. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  64. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  65. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  66. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  67. sglang/srt/sampling_params.py +31 -4
  68. sglang/srt/server.py +93 -21
  69. sglang/srt/server_args.py +30 -19
  70. sglang/srt/utils.py +31 -13
  71. sglang/test/run_eval.py +10 -1
  72. sglang/test/runners.py +63 -63
  73. sglang/test/simple_eval_humaneval.py +2 -8
  74. sglang/test/simple_eval_mgsm.py +203 -0
  75. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  76. sglang/test/test_layernorm.py +60 -0
  77. sglang/test/test_programs.py +4 -2
  78. sglang/test/test_utils.py +21 -3
  79. sglang/utils.py +0 -1
  80. sglang/version.py +1 -1
  81. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/METADATA +50 -31
  82. sglang-0.2.12.dist-info/RECORD +112 -0
  83. sglang/srt/layers/linear.py +0 -884
  84. sglang/srt/layers/quantization/__init__.py +0 -64
  85. sglang/srt/layers/quantization/fp8.py +0 -677
  86. sglang-0.2.10.dist-info/RECORD +0 -100
  87. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
  88. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
  89. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py ADDED
@@ -0,0 +1,79 @@
+ import typing
+
+ import torch
+
+ from ..orchestrator import _BatchedPenalizer, _TokenIDs
+
+
+ class BatchedPresencePenalizer(_BatchedPenalizer):
+     """
+     Presence penalizer penalizes tokens based on their presence in the output.
+     """
+
+     presence_penalties: torch.Tensor = None
+     cumulated_presence_penalties: torch.Tensor = None
+
+     def _is_required(self) -> bool:
+         return any(
+             req.sampling_params.presence_penalty != 0.0
+             for req in self.orchestrator.reqs()
+         )
+
+     def _prepare(self):
+         self.cumulated_presence_penalties = (
+             torch.tensor(
+                 data=[0.0 for _ in self.orchestrator.reqs()],
+                 dtype=torch.float32,
+                 device=self.orchestrator.device,
+             )
+             .unsqueeze_(1)
+             .repeat(1, self.orchestrator.vocab_size)
+         )
+
+         self.presence_penalties = (
+             torch.tensor(
+                 data=[
+                     req.sampling_params.presence_penalty
+                     for req in self.orchestrator.reqs()
+                 ],
+                 dtype=torch.float32,
+                 device=self.orchestrator.device,
+             )
+             .unsqueeze_(1)
+             .expand_as(self.cumulated_presence_penalties)
+         )
+
+     def _teardown(self):
+         del self.presence_penalties
+         del self.cumulated_presence_penalties
+
+         self.presence_penalties = None
+         self.cumulated_presence_penalties = None
+
+     def _cumulate_input_tokens(self, input_ids: _TokenIDs):
+         pass
+
+     def _cumulate_output_tokens(self, output_ids: _TokenIDs):
+         mask = output_ids.occurrence_count() > 0
+         self.cumulated_presence_penalties[mask] = self.presence_penalties[mask]
+
+     def _apply(self, logits: torch.Tensor) -> torch.Tensor:
+         logits -= self.cumulated_presence_penalties
+         return logits
+
+     def _filter(
+         self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
+     ):
+         self.presence_penalties = self.presence_penalties[indices_tensor_to_keep]
+         self.cumulated_presence_penalties = self.cumulated_presence_penalties[
+             indices_tensor_to_keep
+         ]
+
+     def _merge(self, their: "BatchedPresencePenalizer"):
+         self.presence_penalties = torch.cat(
+             [self.presence_penalties, their.presence_penalties], dim=0
+         )
+         self.cumulated_presence_penalties = torch.cat(
+             [self.cumulated_presence_penalties, their.cumulated_presence_penalties],
+             dim=0,
+         )
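
The class above subtracts a per-request presence penalty from the logits of every token that has already appeared in the output. A minimal standalone sketch of the same rule, outside the penaltylib orchestrator (tensor names and sizes here are illustrative, not the penaltylib API):

import torch

vocab_size = 10
presence_penalty = 1.5
logits = torch.zeros(vocab_size)

output_ids = torch.tensor([3, 7, 3])                 # tokens generated so far
seen = torch.zeros(vocab_size, dtype=torch.bool)
seen[output_ids] = True                              # presence only; the count does not matter

logits = logits - presence_penalty * seen.float()    # mirrors _apply() above
print(logits)                                        # positions 3 and 7 are now -1.5
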
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py ADDED
@@ -0,0 +1,83 @@
+ import typing
+
+ import torch
+
+ from ..orchestrator import _BatchedPenalizer, _TokenIDs
+
+
+ class BatchedRepetitionPenalizer(_BatchedPenalizer):
+     """
+     Repetition penalizer penalizes tokens based on their repetition in the input and output.
+     """
+
+     repetition_penalties: torch.Tensor = None
+     cumulated_repetition_penalties: torch.Tensor = None
+
+     def _is_required(self) -> bool:
+         return any(
+             req.sampling_params.repetition_penalty != 1.0
+             for req in self.orchestrator.reqs()
+         )
+
+     def _prepare(self):
+         self.cumulated_repetition_penalties = (
+             torch.tensor(
+                 data=[1.0 for _ in self.orchestrator.reqs()],
+                 dtype=torch.float32,
+                 device=self.orchestrator.device,
+             )
+             .unsqueeze_(1)
+             .repeat(1, self.orchestrator.vocab_size)
+         )
+
+         self.repetition_penalties = (
+             torch.tensor(
+                 data=[
+                     req.sampling_params.repetition_penalty
+                     for req in self.orchestrator.reqs()
+                 ],
+                 dtype=torch.float32,
+                 device=self.orchestrator.device,
+             )
+             .unsqueeze_(1)
+             .expand_as(self.cumulated_repetition_penalties)
+         )
+
+     def _teardown(self):
+         del self.repetition_penalties
+         del self.cumulated_repetition_penalties
+
+         self.repetition_penalties = None
+         self.cumulated_repetition_penalties = None
+
+     def _cumulate_input_tokens(self, input_ids: _TokenIDs):
+         mask = input_ids.occurrence_count() > 0
+         self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]
+
+     def _cumulate_output_tokens(self, output_ids: _TokenIDs):
+         mask = output_ids.occurrence_count() > 0
+         self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]
+
+     def _apply(self, logits: torch.Tensor) -> torch.Tensor:
+         return torch.where(
+             logits > 0,
+             logits / self.cumulated_repetition_penalties,
+             logits * self.cumulated_repetition_penalties,
+         )
+
+     def _filter(
+         self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
+     ):
+         self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
+         self.cumulated_repetition_penalties = self.cumulated_repetition_penalties[
+             indices_tensor_to_keep
+         ]
+
+     def _merge(self, their: "BatchedRepetitionPenalizer"):
+         self.repetition_penalties = torch.cat(
+             [self.repetition_penalties, their.repetition_penalties], dim=0
+         )
+         self.cumulated_repetition_penalties = torch.cat(
+             [self.cumulated_repetition_penalties, their.cumulated_repetition_penalties],
+             dim=0,
+         )
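
The repetition penalizer uses the common asymmetric rule: a penalized token's logit is divided by the penalty when positive and multiplied by it when negative, so the token always becomes less likely. A standalone sketch of that rule with illustrative values (not the penaltylib API):

import torch

penalty = 1.3
logits = torch.tensor([2.0, -1.0, 0.5])
penalized = torch.tensor([True, True, False])   # tokens seen in the input or output

scale = torch.where(penalized, torch.full_like(logits, penalty), torch.ones_like(logits))
adjusted = torch.where(logits > 0, logits / scale, logits * scale)
print(adjusted)   # tensor([1.5385, -1.3000, 0.5000])
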
sglang/srt/sampling_params.py CHANGED
@@ -23,13 +23,16 @@ _SAMPLING_EPS = 1e-6
  class SamplingParams:
      def __init__(
          self,
-         max_new_tokens: int = 16,
+         max_new_tokens: int = 128,
+         min_new_tokens: int = 0,
          stop: Optional[Union[str, List[str]]] = None,
+         stop_token_ids: Optional[List[int]] = [],
          temperature: float = 1.0,
          top_p: float = 1.0,
          top_k: int = -1,
          frequency_penalty: float = 0.0,
          presence_penalty: float = 0.0,
+         repetition_penalty: float = 1.0,
          ignore_eos: bool = False,
          skip_special_tokens: bool = True,
          spaces_between_special_tokens: bool = True,
@@ -42,8 +45,11 @@ class SamplingParams:
          self.top_k = top_k
          self.frequency_penalty = frequency_penalty
          self.presence_penalty = presence_penalty
+         self.repetition_penalty = repetition_penalty
          self.stop_strs = stop
+         self.stop_token_ids = {*stop_token_ids}
          self.max_new_tokens = max_new_tokens
+         self.min_new_tokens = min_new_tokens
          self.ignore_eos = ignore_eos
          self.skip_special_tokens = skip_special_tokens
          self.spaces_between_special_tokens = spaces_between_special_tokens
@@ -80,23 +86,44 @@ class SamplingParams:
              raise ValueError(
                  "presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
              )
+         if not 0.0 <= self.repetition_penalty <= 2.0:
+             raise ValueError(
+                 "repetition_penalty must be in (0, 2], got "
+                 f"{self.repetition_penalty}."
+             )
+         if not 0 <= self.min_new_tokens:
+             raise ValueError(
+                 f"min_new_tokens must be in (0, max_new_tokens], got "
+                 f"{self.min_new_tokens}."
+             )
          if self.max_new_tokens is not None:
              if self.max_new_tokens < 0:
                  raise ValueError(
                      f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
                  )
+             if not self.min_new_tokens <= self.max_new_tokens:
+                 raise ValueError(
+                     f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
+                     f"{self.min_new_tokens}."
+                 )

      def normalize(self, tokenizer):
          # Process stop strings
          if self.stop_strs is None:
              self.stop_strs = []
-             self.stop_str_max_len = 0
+             if self.stop_token_ids is None:
+                 self.stop_str_max_len = 0
+             else:
+                 self.stop_str_max_len = 1
          else:
              if isinstance(self.stop_strs, str):
                  self.stop_strs = [self.stop_strs]

              stop_str_max_len = 0
              for stop_str in self.stop_strs:
-                 stop_str_ids = tokenizer.encode(stop_str, add_special_tokens=False)
-                 stop_str_max_len = max(stop_str_max_len, len(stop_str_ids))
+                 if tokenizer is not None:
+                     stop_str_ids = tokenizer.encode(stop_str, add_special_tokens=False)
+                     stop_str_max_len = max(stop_str_max_len, len(stop_str_ids))
+                 else:
+                     stop_str_max_len = max(stop_str_max_len, len(stop_str))
          self.stop_str_max_len = stop_str_max_len
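
A hedged sketch of constructing SamplingParams with the fields added in this release. The field names and defaults come from the diff above; the assumptions are that the validation shown above is exposed as a verify() method and that 128009 is merely an example stop token id.

from sglang.srt.sampling_params import SamplingParams

params = SamplingParams(
    max_new_tokens=128,        # default raised from 16 to 128
    min_new_tokens=8,          # new: force at least 8 generated tokens
    repetition_penalty=1.1,    # new: 1.0 disables the penalty
    stop_token_ids=[128009],   # new: stop on explicit token ids (example id)
    temperature=0.7,
)
params.verify()                # raises ValueError if the new fields are out of range
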
sglang/srt/server.py CHANGED
@@ -52,13 +52,15 @@ from sglang.srt.managers.controller_single import (
      start_controller_process as start_controller_process_single,
  )
  from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
- from sglang.srt.managers.io_struct import GenerateReqInput
+ from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.openai_api.adapter import (
      load_chat_template_for_openai_api,
      v1_batches,
      v1_chat_completions,
      v1_completions,
+     v1_delete_file,
+     v1_embeddings,
      v1_files_create,
      v1_retrieve_batch,
      v1_retrieve_file,
@@ -73,7 +75,8 @@ from sglang.srt.utils import (
      enable_show_time_cost,
      kill_child_process,
      maybe_set_triton_cache_manager,
-     set_torch_compile_config,
+     prepare_model,
+     prepare_tokenizer,
      set_ulimit,
  )
  from sglang.utils import get_exception_traceback
@@ -97,6 +100,7 @@ async def health() -> Response:
  async def get_model_info():
      result = {
          "model_path": tokenizer_manager.model_path,
+         "is_generation": tokenizer_manager.is_generation,
      }
      return result

@@ -148,6 +152,21 @@ app.post("/generate")(generate_request)
  app.put("/generate")(generate_request)


+ async def encode_request(obj: EmbeddingReqInput, request: Request):
+     """Handle an embedding request."""
+     try:
+         ret = await tokenizer_manager.generate_request(obj, request).__anext__()
+         return ret
+     except ValueError as e:
+         return JSONResponse(
+             {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+         )
+
+
+ app.post("/encode")(encode_request)
+ app.put("/encode")(encode_request)
+
+
  @app.post("/v1/completions")
  async def openai_v1_completions(raw_request: Request):
      return await v1_completions(tokenizer_manager, raw_request)
@@ -158,6 +177,12 @@ async def openai_v1_chat_completions(raw_request: Request):
      return await v1_chat_completions(tokenizer_manager, raw_request)


+ @app.post("/v1/embeddings")
+ async def openai_v1_embeddings(raw_request: Request):
+     response = await v1_embeddings(tokenizer_manager, raw_request)
+     return response
+
+
  @app.get("/v1/models")
  def available_models():
      """Show available models."""
@@ -175,6 +200,12 @@ async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("bat
      )


+ @app.delete("/v1/files/{file_id}")
+ async def delete_file(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/delete
+     return await v1_delete_file(file_id)
+
+
  @app.post("/v1/batches")
  async def openai_v1_batches(raw_request: Request):
      return await v1_batches(tokenizer_manager, raw_request)
@@ -228,6 +259,10 @@ def launch_server(
      )
      logger.info(f"{server_args=}")

+     # Use model from www.modelscope.cn, first download the model.
+     server_args.model_path = prepare_model(server_args.model_path)
+     server_args.tokenizer_path = prepare_tokenizer(server_args.tokenizer_path)
+
      # Launch processes for multi-node tensor parallelism
      if server_args.nnodes > 1:
          if server_args.node_rank != 0:
@@ -340,10 +375,6 @@ def _set_envs_and_config(server_args: ServerArgs):
          # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
          maybe_set_triton_cache_manager()

-     # Set torch compile config
-     if server_args.enable_torch_compile:
-         set_torch_compile_config()
-
      # Set global chat template
      if server_args.chat_template:
          # TODO: replace this with huggingface transformers template
@@ -353,7 +384,7 @@ def _set_envs_and_config(server_args: ServerArgs):
      if not server_args.disable_flashinfer:
          assert_pkg_version(
              "flashinfer",
-             "0.1.3",
+             "0.1.4",
              "Please uninstall the old version and "
              "reinstall the latest version by following the instructions "
              "at https://docs.flashinfer.ai/installation.html.",
@@ -367,35 +398,63 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
          headers["Authorization"] = f"Bearer {server_args.api_key}"

      # Wait until the server is launched
+     success = False
      for _ in range(120):
          time.sleep(1)
          try:
-             requests.get(url + "/get_model_info", timeout=5, headers=headers)
+             res = requests.get(url + "/get_model_info", timeout=5, headers=headers)
+             assert res.status_code == 200, f"{res}"
+             success = True
              break
-         except requests.exceptions.RequestException:
+         except (AssertionError, requests.exceptions.RequestException) as e:
+             last_traceback = get_exception_traceback()
              pass
+     model_info = res.json()
+
+     if not success:
+         if pipe_finish_writer is not None:
+             pipe_finish_writer.send(last_traceback)
+         print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
+         sys.exit(1)

      # Send a warmup request
+     request_name = "/generate" if model_info["is_generation"] else "/encode"
+     max_new_tokens = 8 if model_info["is_generation"] else 1
+     json_data = {
+         "sampling_params": {
+             "temperature": 0,
+             "max_new_tokens": max_new_tokens,
+         },
+     }
+     if server_args.skip_tokenizer_init:
+         json_data["input_ids"] = [10, 11, 12]
+     else:
+         json_data["text"] = "The capital city of France is"
+
      try:
          for _ in range(server_args.dp_size):
              res = requests.post(
-                 url + "/generate",
-                 json={
-                     "text": "The capital city of France is",
-                     "sampling_params": {
-                         "temperature": 0,
-                         "max_new_tokens": 8,
-                     },
-                 },
+                 url + request_name,
+                 json=json_data,
                  headers=headers,
                  timeout=600,
              )
-             assert res.status_code == 200
+             assert res.status_code == 200, f"{res}"
      except Exception as e:
+         last_traceback = get_exception_traceback()
          if pipe_finish_writer is not None:
-             pipe_finish_writer.send(get_exception_traceback())
-         print(f"Initialization failed. warmup error: {e}", flush=True)
-         raise e
+             pipe_finish_writer.send(last_traceback)
+         print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
+         sys.exit(1)
+
+     # Print warnings here
+     if server_args.disable_radix_cache and server_args.chunked_prefill_size is not None:
+         logger.warning(
+             "You set both `--disable-radix-cache` and `--chunked-prefill-size`. "
+             "This combination is an experimental feature and we noticed it can lead to "
+             "wrong generation results. If you want to use chunked prefill, it is recommended "
+             "not using `--disable-radix-cache`."
+         )

      logger.info("The server is fired up and ready to roll!")
      if pipe_finish_writer is not None:
@@ -516,5 +575,18 @@ class Runtime:
          )
          return json.dumps(response.json())

+     def encode(
+         self,
+         prompt: str,
+     ):
+         json_data = {
+             "text": prompt,
+         }
+         response = requests.post(
+             self.url + "/encode",
+             json=json_data,
+         )
+         return json.dumps(response.json())
+
      def __del__(self):
          self.shutdown()
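
A hedged sketch of exercising the new embedding endpoints over HTTP. The port is the usual sglang default and the /v1/embeddings payload follows the OpenAI embeddings API shape, neither of which is spelled out in this diff, so treat both as assumptions; the server must be running an embedding model (for example one backed by the new llama_embedding.py).

import requests

base = "http://127.0.0.1:30000"   # assumed default sglang port

# Native endpoint added in this release (same payload Runtime.encode() sends)
res = requests.post(base + "/encode", json={"text": "The capital city of France is"})
print(res.json())

# OpenAI-compatible endpoint added in this release
res = requests.post(
    base + "/v1/embeddings",
    json={"model": "default", "input": "The capital city of France is"},
)
print(res.json())
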
sglang/srt/server_args.py CHANGED
@@ -27,6 +27,7 @@ class ServerArgs:
      model_path: str
      tokenizer_path: Optional[str] = None
      tokenizer_mode: str = "auto"
+     skip_tokenizer_init: bool = False
      load_format: str = "auto"
      dtype: str = "auto"
      trust_remote_code: bool = True
@@ -42,10 +43,11 @@ class ServerArgs:

      # Memory and scheduling
      mem_fraction_static: Optional[float] = None
-     max_prefill_tokens: Optional[int] = None
      max_running_requests: Optional[int] = None
      max_num_reqs: Optional[int] = None
      max_total_tokens: Optional[int] = None
+     chunked_prefill_size: int = -1
+     max_prefill_tokens: int = 16384
      schedule_policy: str = "lpm"
      schedule_conservativeness: float = 1.0

@@ -62,15 +64,12 @@ class ServerArgs:

      # Other
      api_key: Optional[str] = None
-     file_storage_pth: str = "SGlang_storage"
+     file_storage_pth: str = "SGLang_storage"

      # Data parallelism
      dp_size: int = 1
      load_balance_method: str = "round_robin"

-     # Chunked Prefill
-     chunked_prefill_size: Optional[int] = None
-
      # Optimization/debug options
      disable_flashinfer: bool = False
      disable_flashinfer_sampling: bool = False
@@ -96,6 +95,10 @@ class ServerArgs:
          if self.served_model_name is None:
              self.served_model_name = self.model_path

+         if self.chunked_prefill_size <= 0:
+             # Disable chunked prefill
+             self.chunked_prefill_size = None
+
          if self.mem_fraction_static is None:
              if self.tp_size >= 16:
                  self.mem_fraction_static = 0.79
@@ -107,6 +110,7 @@ class ServerArgs:
                  self.mem_fraction_static = 0.87
              else:
                  self.mem_fraction_static = 0.88
+
          if isinstance(self.additional_ports, int):
              self.additional_ports = [self.additional_ports]
          elif self.additional_ports is None:
@@ -151,6 +155,11 @@ class ServerArgs:
              "tokenizer if available, and 'slow' will "
              "always use the slow tokenizer.",
          )
+         parser.add_argument(
+             "--skip-tokenizer-init",
+             action="store_true",
+             help="If set, skip init tokenizer and pass input_ids in generate request",
+         )
          parser.add_argument(
              "--load-format",
              type=str,
@@ -226,12 +235,6 @@ class ServerArgs:
              default=ServerArgs.mem_fraction_static,
              help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
          )
-         parser.add_argument(
-             "--max-prefill-tokens",
-             type=int,
-             default=ServerArgs.max_prefill_tokens,
-             help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
-         )
          parser.add_argument(
              "--max-running-requests",
              type=int,
@@ -250,6 +253,18 @@ class ServerArgs:
              default=ServerArgs.max_total_tokens,
              help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
          )
+         parser.add_argument(
+             "--chunked-prefill-size",
+             type=int,
+             default=ServerArgs.chunked_prefill_size,
+             help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill",
+         )
+         parser.add_argument(
+             "--max-prefill-tokens",
+             type=int,
+             default=ServerArgs.max_prefill_tokens,
+             help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
+         )
          parser.add_argument(
              "--schedule-policy",
              type=str,
@@ -264,6 +279,7 @@ class ServerArgs:
              help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
          )
          parser.add_argument(
+             "--tensor-parallel-size",
              "--tp-size",
              type=int,
              default=ServerArgs.tp_size,
@@ -318,6 +334,7 @@ class ServerArgs:

          # Data parallelism
          parser.add_argument(
+             "--data-parallel-size",
              "--dp-size",
              type=int,
              default=ServerArgs.dp_size,
@@ -345,14 +362,6 @@ class ServerArgs:
          )
          parser.add_argument("--node-rank", type=int, help="The node rank.")

-         # Chunked prefill
-         parser.add_argument(
-             "--chunked-prefill-size",
-             type=int,
-             default=ServerArgs.chunked_prefill_size,
-             help="The size of the chunked prefill.",
-         )
-
          # Optimization/debug options
          parser.add_argument(
              "--disable-flashinfer",
@@ -413,6 +422,8 @@ class ServerArgs:

      @classmethod
      def from_cli_args(cls, args: argparse.Namespace):
+         args.tp_size = args.tensor_parallel_size
+         args.dp_size = args.data_parallel_size
          attrs = [attr.name for attr in dataclasses.fields(cls)]
          return cls(**{attr: getattr(args, attr) for attr in attrs})

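A hedged sketch of the new ServerArgs behavior: a non-positive chunked_prefill_size is normalized to None in __post_init__ (chunked prefill disabled), max_prefill_tokens is now a plain default of 16384, and the long-form flags --tensor-parallel-size / --data-parallel-size alias --tp-size / --dp-size. The model path below is a placeholder.

from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder
    chunked_prefill_size=-1,   # default; becomes None after __post_init__
    max_prefill_tokens=16384,  # new default, now independent of chunked prefill
    tp_size=1,
)
assert args.chunked_prefill_size is None   # chunked prefill disabled

On the command line the equivalent is --chunked-prefill-size -1 (or simply omitting the flag) together with --tensor-parallel-size 1.
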
sglang/srt/utils.py CHANGED
@@ -197,6 +197,8 @@ def allocate_init_ports(
  def get_int_token_logit_bias(tokenizer, vocab_size):
      """Get the logit bias for integer-only tokens."""
      # a bug when model's vocab size > tokenizer.vocab_size
+     if tokenizer == None:
+         return [-1e5] * vocab_size
      vocab_size = tokenizer.vocab_size
      logit_bias = np.zeros(vocab_size, dtype=np.float32)
      for t_id in range(vocab_size):
@@ -223,6 +225,15 @@ def is_multimodal_model(model):
      raise ValueError("unrecognized type")


+ def is_generation_model(model_architectures):
+     if (
+         "LlamaEmbeddingModel" in model_architectures
+         or "MistralModel" in model_architectures
+     ):
+         return False
+     return True
+
+
  def decode_video_base64(video_base64):
      from PIL import Image

@@ -622,19 +633,6 @@ def receive_addrs(model_port_args, server_args):
      dist.destroy_process_group()


- def set_torch_compile_config():
-     # The following configurations are for torch compile optimizations
-     import torch._dynamo.config
-     import torch._inductor.config
-
-     torch._inductor.config.coordinate_descent_tuning = True
-     torch._inductor.config.triton.unique_kernel_names = True
-     torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future
-
-     # FIXME: tmp workaround
-     torch._dynamo.config.accumulated_cache_size_limit = 256
-
-
  def set_ulimit(target_soft_limit=65535):
      resource_type = resource.RLIMIT_NOFILE
      current_soft, current_hard = resource.getrlimit(resource_type)
@@ -705,3 +703,23 @@ def add_api_key_middleware(app, api_key):
          if request.headers.get("Authorization") != "Bearer " + api_key:
              return JSONResponse(content={"error": "Unauthorized"}, status_code=401)
          return await call_next(request)
+
+
+ def prepare_model(model_path):
+     if "SGLANG_USE_MODELSCOPE" in os.environ:
+         if not os.path.exists(model_path):
+             from modelscope import snapshot_download
+
+             return snapshot_download(model_path)
+     return model_path
+
+
+ def prepare_tokenizer(tokenizer_path):
+     if "SGLANG_USE_MODELSCOPE" in os.environ:
+         if not os.path.exists(tokenizer_path):
+             from modelscope import snapshot_download
+
+             return snapshot_download(
+                 tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
+             )
+     return tokenizer_path
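
A hedged sketch of the new ModelScope path: when the SGLANG_USE_MODELSCOPE environment variable is present (only its presence is checked, not its value) and the given path does not exist locally, prepare_model and prepare_tokenizer fall back to modelscope.snapshot_download, which requires the modelscope package to be installed. The model id below is a placeholder.

import os

from sglang.srt.utils import prepare_model, prepare_tokenizer

os.environ["SGLANG_USE_MODELSCOPE"] = "1"
model_path = prepare_model("qwen/Qwen2-7B-Instruct")        # placeholder ModelScope id
tokenizer_path = prepare_tokenizer("qwen/Qwen2-7B-Instruct")
print(model_path, tokenizer_path)
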
sglang/test/run_eval.py CHANGED
@@ -16,6 +16,8 @@ from sglang.test.simple_eval_common import (


  def run_eval(args):
+     set_ulimit()
+
      if "OPENAI_API_KEY" not in os.environ:
          os.environ["OPENAI_API_KEY"] = "EMPTY"

@@ -39,6 +41,14 @@ def run_eval(args):
          eval_obj = MathEval(
              filename, equality_checker, args.num_examples, args.num_threads
          )
+     elif args.eval_name == "mgsm":
+         from sglang.test.simple_eval_mgsm import MGSMEval
+
+         eval_obj = MGSMEval(args.num_examples, args.num_threads)
+     elif args.eval_name == "mgsm_en":
+         from sglang.test.simple_eval_mgsm import MGSMEval
+
+         eval_obj = MGSMEval(args.num_examples, args.num_threads, languages=["en"])
      elif args.eval_name == "gpqa":
          from sglang.test.simple_eval_gpqa import GPQAEval

@@ -109,7 +119,6 @@ if __name__ == "__main__":
      parser.add_argument("--eval-name", type=str, default="mmlu")
      parser.add_argument("--num-examples", type=int)
      parser.add_argument("--num-threads", type=int, default=512)
-     set_ulimit()
      args = parser.parse_args()

      run_eval(args)