sglang 0.2.14__py3-none-any.whl → 0.2.14.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/launch_server_llavavid.py +26 -0
- sglang/srt/constrained/fsm_cache.py +11 -2
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/hf_transformers_utils.py +0 -149
- sglang/srt/layers/activation.py +93 -11
- sglang/srt/layers/layernorm.py +47 -4
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/sampler.py +15 -68
- sglang/srt/managers/io_struct.py +5 -4
- sglang/srt/managers/schedule_batch.py +20 -25
- sglang/srt/managers/tokenizer_manager.py +74 -61
- sglang/srt/managers/tp_worker.py +49 -43
- sglang/srt/model_executor/cuda_graph_runner.py +17 -31
- sglang/srt/model_executor/forward_batch_info.py +9 -26
- sglang/srt/model_executor/model_runner.py +20 -17
- sglang/srt/models/chatglm.py +13 -5
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +1 -5
- sglang/srt/models/gemma.py +3 -7
- sglang/srt/models/gemma2.py +2 -56
- sglang/srt/models/gpt_bigcode.py +2 -6
- sglang/srt/models/grok.py +10 -8
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/llama2.py +6 -11
- sglang/srt/models/llama_classification.py +2 -6
- sglang/srt/models/llama_embedding.py +3 -4
- sglang/srt/models/llava.py +69 -91
- sglang/srt/models/llavavid.py +40 -86
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/mixtral.py +1 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/qwen.py +2 -5
- sglang/srt/models/qwen2.py +5 -10
- sglang/srt/models/qwen2_moe.py +21 -24
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/models/yivl.py +2 -7
- sglang/srt/openai_api/adapter.py +85 -4
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -74
- sglang/srt/sampling/sampling_params.py +4 -0
- sglang/srt/server.py +11 -4
- sglang/srt/utils.py +18 -33
- sglang/test/runners.py +2 -2
- sglang/test/test_layernorm.py +53 -1
- sglang/version.py +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/METADATA +11 -5
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/RECORD +52 -51
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/WHEEL +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/LICENSE +0 -0
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
@@ -275,10 +275,12 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
         end_point = batch_storage[batch_id].endpoint
         file_request_list = []
         all_requests = []
+        request_ids = []
         for line in lines:
             request_data = json.loads(line)
             file_request_list.append(request_data)
             body = request_data["body"]
+            request_ids.append(request_data["custom_id"])
 
             # Although streaming is supported for standalone completions, it is not supported in
             # batch mode (multiple completions in single request).
@@ -289,12 +291,16 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 all_requests.append(ChatCompletionRequest(**body))
             elif end_point == "/v1/completions":
                 all_requests.append(CompletionRequest(**body))
+
         if end_point == "/v1/chat/completions":
             adapted_request, request = v1_chat_generate_request(
-                all_requests, tokenizer_manager
+                all_requests, tokenizer_manager, request_ids=request_ids
             )
         elif end_point == "/v1/completions":
-            adapted_request, request = v1_generate_request(
+            adapted_request, request = v1_generate_request(
+                all_requests, request_ids=request_ids
+            )
+
         try:
             ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
             if not isinstance(ret, list):
@@ -326,6 +332,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 }
                 all_ret.append(response_json)
                 completed_requests += 1
+
         # Write results to a new file
         output_file_id = f"backend_result_file-{uuid.uuid4()}"
         global storage_dir
@@ -372,6 +379,72 @@ async def v1_retrieve_batch(batch_id: str):
     return batch_response
 
 
+async def v1_cancel_batch(tokenizer_manager, batch_id: str):
+    # Retrieve the batch job from the in-memory storage
+    batch_response = batch_storage.get(batch_id)
+    if batch_response is None:
+        raise HTTPException(status_code=404, detail="Batch not found")
+
+    # Only do cancal when status is "validating" or "in_progress"
+    if batch_response.status in ["validating", "in_progress"]:
+        # Start cancelling the batch asynchronously
+        asyncio.create_task(
+            cancel_batch(
+                tokenizer_manager=tokenizer_manager,
+                batch_id=batch_id,
+                input_file_id=batch_response.input_file_id,
+            )
+        )
+
+        # Update batch status to "cancelling"
+        batch_response.status = "cancelling"
+
+        return batch_response
+    else:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Current status is {batch_response.status}, no need to cancel",
+        )
+
+
+async def cancel_batch(tokenizer_manager, batch_id: str, input_file_id: str):
+    try:
+        # Update the batch status to "cancelling"
+        batch_storage[batch_id].status = "cancelling"
+
+        # Retrieve the input file content
+        input_file_request = file_id_request.get(input_file_id)
+        if not input_file_request:
+            raise ValueError("Input file not found")
+
+        # Parse the JSONL file and process each request
+        input_file_path = file_id_storage.get(input_file_id)
+        with open(input_file_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        file_request_list = []
+        request_ids = []
+        for line in lines:
+            request_data = json.loads(line)
+            file_request_list.append(request_data)
+            request_ids.append(request_data["custom_id"])
+
+        # Cancel requests by request_ids
+        for rid in request_ids:
+            tokenizer_manager.abort_request(rid=rid)
+
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "cancelled"
+
+    except Exception as e:
+        logger.error("error in SGLang:", e)
+        # Update batch status to "failed"
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "failed"
+        retrieve_batch.failed_at = int(time.time())
+        retrieve_batch.errors = {"message": str(e)}
+
+
 async def v1_retrieve_file(file_id: str):
     # Retrieve the batch job from the in-memory storage
     file_response = file_id_response.get(file_id)
@@ -392,7 +465,9 @@ async def v1_retrieve_file_content(file_id: str):
     return StreamingResponse(iter_file(), media_type="application/octet-stream")
 
 
-def v1_generate_request(
+def v1_generate_request(
+    all_requests: List[CompletionRequest], request_ids: List[str] = None
+):
     prompts = []
     sampling_params_list = []
     return_logprobs = []
@@ -434,6 +509,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
                 "ignore_eos": request.ignore_eos,
             }
@@ -463,6 +539,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
         logprob_start_len=logprob_start_lens,
         return_text_in_logprobs=True,
         stream=all_requests[0].stream,
+        rid=request_ids,
     )
 
     if len(all_requests) == 1:
@@ -745,7 +822,9 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
 
 
 def v1_chat_generate_request(
-    all_requests: List[ChatCompletionRequest],
+    all_requests: List[ChatCompletionRequest],
+    tokenizer_manager,
+    request_ids: List[str] = None,
 ):
     input_ids = []
     sampling_params_list = []
@@ -802,6 +881,7 @@ def v1_chat_generate_request(
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
             }
         )
@@ -832,6 +912,7 @@ def v1_chat_generate_request(
         top_logprobs_num=top_logprobs_nums,
         stream=all_requests[0].stream,
        return_text_in_logprobs=True,
+        rid=request_ids,
     )
     if len(all_requests) == 1:
         return adapted_request, all_requests[0]
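Note: the `v1_cancel_batch` handler added above is wired to a new `POST /v1/batches/{batch_id}/cancel` route in `sglang/srt/server.py` (see that diff below). A minimal client-side sketch, assuming a locally running server on port 30000 and a previously created batch id (both placeholders):

```python
import requests

BASE_URL = "http://localhost:30000"  # assumed local SGLang server
batch_id = "batch_abc123"            # hypothetical id returned when the batch was created

# Ask the server to cancel; it marks the batch "cancelling" and aborts the
# underlying requests (by their custom_id) in a background task.
resp = requests.post(f"{BASE_URL}/v1/batches/{batch_id}/cancel")
print(resp.status_code, resp.json())

# Poll the retrieve endpoint until the batch reaches "cancelled" (or "failed").
status = requests.get(f"{BASE_URL}/v1/batches/{batch_id}").json()
print(status.get("status"))
```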
sglang/srt/openai_api/protocol.py
CHANGED
@@ -161,6 +161,7 @@ class CompletionRequest(BaseModel):
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     ignore_eos: Optional[bool] = False
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
@@ -262,6 +263,7 @@ class ChatCompletionRequest(BaseModel):
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
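Note: `json_schema` joins `regex` as an SRT-only extra field on both request models; it flows through `v1_generate_request`/`v1_chat_generate_request` (above) into `SamplingParams` (below). A hedged sketch of passing it from a client, assuming a local server and the `openai` Python package (`extra_body` carries non-standard fields):

```python
import json

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")  # assumed local server

schema = json.dumps(
    {
        "type": "object",
        "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
        "required": ["name", "age"],
    }
)

# json_schema is ignored by OpenAI itself, so send it via extra_body.
response = client.chat.completions.create(
    model="default",  # model name is server-dependent; "default" is an assumption
    messages=[{"role": "user", "content": "Describe a fictional person as JSON."}],
    extra_body={"json_schema": schema},
)
print(response.choices[0].message.content)
```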
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -21,63 +21,10 @@ class SamplingBatchInfo:
     top_ps: torch.Tensor = None
     top_ks: torch.Tensor = None
     min_ps: torch.Tensor = None
-
-    # Dispatch in CUDA graph
-    need_min_p_sampling: bool = False
-
-    # Bias Tensors
+    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
     logit_bias: torch.Tensor = None
     vocab_mask: torch.Tensor = None
 
-    # Penalizer
-    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
-    linear_penalties: torch.Tensor = None
-    scaling_penalties: torch.Tensor = None
-
-    def has_bias(self):
-        return (
-            self.logit_bias is not None
-            or self.vocab_mask is not None
-            or self.linear_penalties is not None
-            or self.scaling_penalties is not None
-        )
-
-    @classmethod
-    def dummy_one(cls, max_bs: int, vocab_size: int):
-        ret = cls(vocab_size=vocab_size)
-        ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
-        ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
-        ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
-        ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
-        return ret
-
-    def __getitem__(self, key):
-        if isinstance(key, slice):
-            # NOTE: We do not use cuda graph when there is bias tensors
-            assert not self.has_bias()
-            return SamplingBatchInfo(
-                vocab_size=self.vocab_size,
-                temperatures=self.temperatures[key],
-                top_ps=self.top_ps[key],
-                top_ks=self.top_ks[key],
-                min_ps=self.min_ps[key],
-                need_min_p_sampling=self.need_min_p_sampling,
-            )
-        else:
-            raise NotImplementedError
-
-    def inplace_assign(self, bs: int, other: SamplingBatchInfo):
-        # NOTE: We do not use cuda graph when there is bias tensors
-        assert not self.has_bias()
-
-        self.vocab_size = other.vocab_size
-        self.need_min_p_sampling = other.need_min_p_sampling
-
-        self.temperatures[:bs] = other.temperatures
-        self.top_ps[:bs] = other.top_ps
-        self.top_ks[:bs] = other.top_ks
-        self.min_ps[:bs] = other.min_ps
-
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         device = "cuda"
@@ -98,7 +45,6 @@ class SamplingBatchInfo:
         ret.min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
         )
-        ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)
 
         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -126,25 +72,6 @@ class SamplingBatchInfo:
 
         return ret
 
-    def prepare_penalties(self):
-        self.scaling_penalties = None
-        self.linear_penalties = None
-
-        for penalizer in self.penalizer_orchestrator.penalizers.values():
-            if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
-                if penalizer.is_prepared():
-                    self.scaling_penalties = penalizer.cumulated_repetition_penalties
-            else:
-                if penalizer.is_prepared():
-                    if self.linear_penalties is None:
-                        bs = self.penalizer_orchestrator.batch.batch_size()
-                        self.linear_penalties = torch.zeros(
-                            (bs, self.vocab_size),
-                            dtype=torch.float32,
-                            device="cuda",
-                        )
-                    self.linear_penalties = penalizer.apply(self.linear_penalties)
-
     def update_regex_vocab_mask(self, batch: ScheduleBatch):
         bs, reqs = batch.batch_size(), batch.reqs
         device = "cuda"
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -39,6 +39,7 @@ class SamplingParams:
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,
+        json_schema: Optional[str] = None,
     ) -> None:
         self.temperature = temperature
         self.top_p = top_p
@@ -56,6 +57,7 @@ class SamplingParams:
         self.spaces_between_special_tokens = spaces_between_special_tokens
         self.regex = regex
         self.n = n
+        self.json_schema = json_schema
 
         # Process some special cases
         if self.temperature < _SAMPLING_EPS:
@@ -106,6 +108,8 @@ class SamplingParams:
                 f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
+        if self.regex is not None and self.json_schema is not None:
+            raise ValueError("regex and json_schema cannot be both set.")
 
     def normalize(self, tokenizer):
         # Process stop strings
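Note: the check added above makes `regex` and `json_schema` mutually exclusive. A small sketch of the intended behavior; the import path comes from the file listing, and the assumption that the check runs in the params' `verify()` step is inferred from the surrounding hunk context:

```python
from sglang.srt.sampling.sampling_params import SamplingParams

# Either structured-output constraint on its own is accepted.
SamplingParams(json_schema='{"type": "object"}')
SamplingParams(regex=r"(yes|no)")

# Supplying both should trip the new check during validation.
params = SamplingParams(regex=r"(yes|no)", json_schema='{"type": "object"}')
try:
    params.verify()  # assumed validation entry point, per the hunk context
except ValueError as e:
    print(e)  # "regex and json_schema cannot be both set."
```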
sglang/srt/server.py
CHANGED
@@ -59,6 +59,7 @@ from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
     v1_batches,
+    v1_cancel_batch,
     v1_chat_completions,
     v1_completions,
     v1_delete_file,
@@ -246,6 +247,12 @@ async def openai_v1_batches(raw_request: Request):
     return await v1_batches(tokenizer_manager, raw_request)
 
 
+@app.post("/v1/batches/{batch_id}/cancel")
+async def cancel_batches(batch_id: str):
+    # https://platform.openai.com/docs/api-reference/batch/cancel
+    return await v1_cancel_batch(tokenizer_manager, batch_id)
+
+
 @app.get("/v1/batches/{batch_id}")
 async def retrieve_batch(batch_id: str):
     return await v1_retrieve_batch(batch_id)
@@ -328,12 +335,12 @@ def launch_server(
     pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)
 
     if server_args.dp_size == 1:
-
+        start_controller_process = start_controller_process_single
     else:
-
+        start_controller_process = start_controller_process_multi
 
     proc_controller = mp.Process(
-        target=
+        target=start_controller_process,
         args=(server_args, port_args, pipe_controller_writer, model_overide_args),
     )
     proc_controller.start()
@@ -414,7 +421,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.6",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
sglang/srt/utils.py
CHANGED
@@ -26,7 +26,7 @@ import struct
 import time
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
-from typing import List, Optional
+from typing import List, Optional, Union
 
 import numpy as np
 import psutil
@@ -193,35 +193,16 @@ def allocate_init_ports(
     return ret_ports[0], ret_ports[1:num_ports_needed]
 
 
-def
-
-
-
-
-
-
-
-
-
-        logit_bias[t_id] = -1e5
-
-    return logit_bias
-
-
-def is_multimodal_model(model):
-    from sglang.srt.model_config import ModelConfig
-
-    if isinstance(model, str):
-        model = model.lower()
-        return "llava" in model or "yi-vl" in model or "llava-next" in model
-
-    if isinstance(model, ModelConfig):
-        model_path = model.path.lower()
-        return (
-            "llava" in model_path or "yi-vl" in model_path or "llava-next" in model_path
-        )
-
-    raise ValueError("unrecognized type")
+def is_multimodal_model(model_architectures):
+    if (
+        "LlavaLlamaForCausalLM" in model_architectures
+        or "LlavaQwenForCausalLM" in model_architectures
+        or "LlavaMistralForCausalLM" in model_architectures
+        or "LlavaVidForCausalLM" in model_architectures
+    ):
+        return True
+    else:
+        return False
 
 
 def is_generation_model(model_architectures, is_embedding: bool = False):
@@ -317,12 +298,14 @@ def decode_video_base64(video_base64):
     )  # Return an empty array and size tuple if no frames were found
 
 
-def load_image(image_file):
+def load_image(image_file: Union[str, bytes]):
     from PIL import Image
 
     image = image_size = None
 
-    if image_file
+    if isinstance(image_file, bytes):
+        image = Image.open(BytesIO(image_file))
+    elif image_file.startswith("http://") or image_file.startswith("https://"):
         timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
         response = requests.get(image_file, timeout=timeout)
         image = Image.open(BytesIO(response.content))
@@ -334,8 +317,10 @@ def load_image(image_file):
     elif image_file.startswith("video:"):
         image_file = image_file.replace("video:", "")
         image, image_size = decode_video_base64(image_file)
-
+    elif isinstance(image_file, str):
         image = Image.open(BytesIO(base64.b64decode(image_file)))
+    else:
+        raise ValueError(f"Invalid image: {image}")
 
     return image, image_size
 
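Note: with the new signature, `load_image` accepts raw `bytes` in addition to URL and base64/video strings, and rejects anything else. A hedged usage sketch (file name and URL are placeholders):

```python
from sglang.srt.utils import load_image

# Raw bytes (e.g. an uploaded file) hit the new isinstance(..., bytes) branch.
with open("cat.jpg", "rb") as f:
    image, image_size = load_image(f.read())

# URLs still go through the requests-based branch.
image, image_size = load_image("https://example.com/cat.jpg")
```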
sglang/test/runners.py
CHANGED
@@ -30,7 +30,7 @@ DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
     # "The capital of France is",
     "Apple is red. Banana is Yellow. " * 800 + "Apple is",
-    "The capital of the United
+    "The capital of the United Kingdom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
 ]
@@ -180,7 +180,7 @@ class SRTRunner:
                 tp_size=tp_size,
                 dtype=get_dtype_str(torch_dtype),
                 port=port,
-                mem_fraction_static=0.
+                mem_fraction_static=0.7,
                 trust_remote_code=False,
                 is_embedding=not self.is_generation,
             )
sglang/test/test_layernorm.py
CHANGED
@@ -3,7 +3,7 @@ import unittest
 
 import torch
 
-from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
 
 
 class TestRMSNorm(unittest.TestCase):
@@ -56,5 +56,57 @@ class TestRMSNorm(unittest.TestCase):
                 self._run_rms_norm_test(*params)
 
 
+class TestGemmaRMSNorm(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 4096]
+    HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
+    ADD_RESIDUAL = [False, True]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gemma_rms_norm_test(
+        self, num_tokens, hidden_size, add_residual, dtype, seed
+    ):
+        torch.manual_seed(seed)
+
+        layer = GemmaRMSNorm(hidden_size).to(dtype=dtype)
+        layer.weight.data.normal_(mean=1.0, std=0.1)
+        scale = 1 / (2 * hidden_size)
+        x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
+        residual = torch.randn_like(x) * scale if add_residual else None
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x, residual)
+            out = layer(x, residual)
+
+        if add_residual:
+            self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-3, rtol=1e-3))
+            self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-3, rtol=1e-3))
+        else:
+            self.assertTrue(torch.allclose(out, ref_out, atol=1e-3, rtol=1e-3))
+
+    def test_gemma_rms_norm(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.HIDDEN_SIZES,
+            self.ADD_RESIDUAL,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                hidden_size=params[1],
+                add_residual=params[2],
+                dtype=params[3],
+                seed=params[4],
+            ):
+                self._run_gemma_rms_norm_test(*params)
+
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.14"
+__version__ = "0.2.14.post2"
{sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.14
+Version: 0.2.14.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -339,6 +339,7 @@ docker run --gpus all \
 ### Method 4: Using docker compose
 
 <details>
+<summary>More</summary>
 
 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
@@ -350,6 +351,7 @@ docker run --gpus all \
 ### Method 5: Run on Kubernetes or Clouds with SkyPilot
 
 <details>
+<summary>More</summary>
 
 To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
 
@@ -389,7 +391,7 @@ sky status --endpoint 30000 sglang
 
 
 ### Common Notes
-- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang.
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Backend: SGLang Runtime (SRT)
@@ -494,7 +496,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
-  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - LLaVA 1.5 / 1.6 / NeXT
   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
@@ -518,6 +520,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 
 #### Use Models From ModelScope
 <details>
+<summary>More</summary>
 
 To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
@@ -532,6 +535,7 @@ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen
 
 #### Run Llama 3.1 405B
 <details>
+<summary>More</summary>
 
 ```bash
 # Run 405B (fp8) on a single node
@@ -549,7 +553,9 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
 ### Benchmark Performance
 
-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+  Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+  A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```