sglang 0.2.14__py3-none-any.whl → 0.2.14.post1__py3-none-any.whl
This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- sglang/srt/constrained/fsm_cache.py +11 -2
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/layers/activation.py +83 -7
- sglang/srt/layers/layernorm.py +0 -3
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/sampler.py +15 -68
- sglang/srt/managers/schedule_batch.py +15 -20
- sglang/srt/managers/tp_worker.py +40 -33
- sglang/srt/model_executor/cuda_graph_runner.py +17 -31
- sglang/srt/model_executor/forward_batch_info.py +1 -8
- sglang/srt/model_executor/model_runner.py +5 -11
- sglang/srt/models/chatglm.py +12 -4
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +1 -5
- sglang/srt/models/gemma.py +1 -5
- sglang/srt/models/gemma2.py +1 -5
- sglang/srt/models/gpt_bigcode.py +2 -6
- sglang/srt/models/grok.py +1 -5
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/llama2.py +3 -7
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/mixtral.py +1 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/qwen.py +2 -5
- sglang/srt/models/qwen2.py +2 -6
- sglang/srt/models/qwen2_moe.py +14 -5
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/openai_api/adapter.py +85 -4
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -74
- sglang/srt/sampling/sampling_params.py +4 -0
- sglang/srt/server.py +8 -1
- sglang/test/runners.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/METADATA +10 -4
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/RECORD +42 -42
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/WHEEL +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/LICENSE +0 -0
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/mixtral.py
CHANGED
@@ -41,7 +41,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -300,7 +299,6 @@ class MixtralForCausalLM(nn.Module):
         self.model = MixtralModel(config, quant_config=quant_config, prefix="model")
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     def forward(
         self,
@@ -310,11 +308,9 @@ class MixtralForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/mixtral_quant.py
CHANGED
@@ -45,7 +45,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -334,7 +333,6 @@ class QuantMixtralForCausalLM(nn.Module):
         self.model = MixtralModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -345,11 +343,9 @@ class QuantMixtralForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
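The same refactor repeats in the qwen, qwen2, stablelm and, with small variations, qwen2_moe diffs below: each model drops its per-model `Sampler`, and `forward` now returns the `LogitsProcessor` output directly, with token sampling handled outside the model (see the `sampler.py`, `model_runner.py`, and `tp_worker.py` entries in the file list). A toy, self-contained sketch of the new control flow, using stand-in classes rather than the real sglang modules:

```python
# Toy stand-ins (not the real sglang classes) illustrating the control-flow change:
# the model returns logits; sampling is applied by the caller/runtime.
import torch


class ToyLogitsProcessor:
    def __call__(self, hidden_states: torch.Tensor, lm_head_weight: torch.Tensor) -> torch.Tensor:
        # Project hidden states to vocabulary logits.
        return hidden_states @ lm_head_weight.t()


class ToyCausalLM:
    def __init__(self, hidden_size: int = 8, vocab_size: int = 32):
        self.lm_head_weight = torch.randn(vocab_size, hidden_size)
        self.logits_processor = ToyLogitsProcessor()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 0.2.14.post1 pattern: no self.sampler; return the logits-processor output.
        return self.logits_processor(hidden_states, self.lm_head_weight)


def sample_greedy(logits: torch.Tensor) -> torch.Tensor:
    # In the real runtime this step is done by the shared sampler, outside the model.
    return logits.argmax(dim=-1)


model = ToyCausalLM()
hidden = torch.randn(4, 8)                    # batch of 4 "last hidden states"
next_tokens = sample_greedy(model.forward(hidden))
print(next_tokens.shape)                      # torch.Size([4])
```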
sglang/srt/models/qwen.py
CHANGED
@@ -39,7 +39,6 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -252,7 +251,6 @@ class QWenLMHeadModel(nn.Module):
         vocab_size = ((config.vocab_size + 63) // 64) * 64
         self.lm_head = ParallelLMHead(vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -262,11 +260,10 @@ class QWenLMHeadModel(nn.Module):
         input_metadata: InputMetadata,
     ):
         hidden_states = self.transformer(input_ids, positions, input_metadata)
-        logits_output = self.logits_processor(
+        next_tokens = self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
+        return next_tokens

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/qwen2.py
CHANGED
@@ -38,9 +38,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata

 Qwen2Config = None
@@ -277,7 +276,6 @@ class Qwen2ForCausalLM(nn.Module):
         self.model = Qwen2Model(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

     @torch.no_grad()
@@ -291,11 +289,9 @@ class Qwen2ForCausalLM(nn.Module):
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
         if not get_embedding:
-            logits_output = self.logits_processor(
+            return self.logits_processor(
                 input_ids, hidden_states, self.lm_head.weight, input_metadata
             )
-            sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-            return sample_output, logits_output
         else:
             return self.pooler(hidden_states, input_metadata)

sglang/srt/models/qwen2_moe.py
CHANGED
@@ -35,8 +35,10 @@ from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -47,7 +49,6 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -365,7 +366,6 @@ class Qwen2MoeForCausalLM(nn.Module):
             config.vocab_size, config.hidden_size, quant_config=quant_config
         )
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -376,11 +376,20 @@ class Qwen2MoeForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
+
+    def compute_logits(
+        self,
+        input_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        logits = self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
+        return logits

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/stablelm.py
CHANGED
@@ -40,7 +40,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -250,7 +249,6 @@ class StableLmForCausalLM(nn.Module):
         self.model = StableLMEpochModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -261,11 +259,9 @@ class StableLmForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/openai_api/adapter.py
CHANGED
@@ -275,10 +275,12 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
     end_point = batch_storage[batch_id].endpoint
     file_request_list = []
     all_requests = []
+    request_ids = []
     for line in lines:
         request_data = json.loads(line)
         file_request_list.append(request_data)
         body = request_data["body"]
+        request_ids.append(request_data["custom_id"])

         # Although streaming is supported for standalone completions, it is not supported in
         # batch mode (multiple completions in single request).
@@ -289,12 +291,16 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
             all_requests.append(ChatCompletionRequest(**body))
         elif end_point == "/v1/completions":
             all_requests.append(CompletionRequest(**body))
+
     if end_point == "/v1/chat/completions":
         adapted_request, request = v1_chat_generate_request(
-            all_requests, tokenizer_manager
+            all_requests, tokenizer_manager, request_ids=request_ids
         )
     elif end_point == "/v1/completions":
-        adapted_request, request = v1_generate_request(all_requests)
+        adapted_request, request = v1_generate_request(
+            all_requests, request_ids=request_ids
+        )
+
     try:
         ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
         if not isinstance(ret, list):
@@ -326,6 +332,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
             }
             all_ret.append(response_json)
             completed_requests += 1
+
         # Write results to a new file
         output_file_id = f"backend_result_file-{uuid.uuid4()}"
         global storage_dir
@@ -372,6 +379,72 @@ async def v1_retrieve_batch(batch_id: str):
     return batch_response


+async def v1_cancel_batch(tokenizer_manager, batch_id: str):
+    # Retrieve the batch job from the in-memory storage
+    batch_response = batch_storage.get(batch_id)
+    if batch_response is None:
+        raise HTTPException(status_code=404, detail="Batch not found")
+
+    # Only do cancal when status is "validating" or "in_progress"
+    if batch_response.status in ["validating", "in_progress"]:
+        # Start cancelling the batch asynchronously
+        asyncio.create_task(
+            cancel_batch(
+                tokenizer_manager=tokenizer_manager,
+                batch_id=batch_id,
+                input_file_id=batch_response.input_file_id,
+            )
+        )
+
+        # Update batch status to "cancelling"
+        batch_response.status = "cancelling"
+
+        return batch_response
+    else:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Current status is {batch_response.status}, no need to cancel",
+        )
+
+
+async def cancel_batch(tokenizer_manager, batch_id: str, input_file_id: str):
+    try:
+        # Update the batch status to "cancelling"
+        batch_storage[batch_id].status = "cancelling"
+
+        # Retrieve the input file content
+        input_file_request = file_id_request.get(input_file_id)
+        if not input_file_request:
+            raise ValueError("Input file not found")
+
+        # Parse the JSONL file and process each request
+        input_file_path = file_id_storage.get(input_file_id)
+        with open(input_file_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        file_request_list = []
+        request_ids = []
+        for line in lines:
+            request_data = json.loads(line)
+            file_request_list.append(request_data)
+            request_ids.append(request_data["custom_id"])
+
+        # Cancel requests by request_ids
+        for rid in request_ids:
+            tokenizer_manager.abort_request(rid=rid)
+
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "cancelled"
+
+    except Exception as e:
+        logger.error("error in SGLang:", e)
+        # Update batch status to "failed"
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "failed"
+        retrieve_batch.failed_at = int(time.time())
+        retrieve_batch.errors = {"message": str(e)}
+
+
 async def v1_retrieve_file(file_id: str):
     # Retrieve the batch job from the in-memory storage
     file_response = file_id_response.get(file_id)
@@ -392,7 +465,9 @@ async def v1_retrieve_file_content(file_id: str):
     return StreamingResponse(iter_file(), media_type="application/octet-stream")


-def v1_generate_request(all_requests: List[CompletionRequest]):
+def v1_generate_request(
+    all_requests: List[CompletionRequest], request_ids: List[str] = None
+):
     prompts = []
     sampling_params_list = []
     return_logprobs = []
@@ -434,6 +509,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
                 "ignore_eos": request.ignore_eos,
             }
@@ -463,6 +539,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
         logprob_start_len=logprob_start_lens,
         return_text_in_logprobs=True,
         stream=all_requests[0].stream,
+        rid=request_ids,
     )

     if len(all_requests) == 1:
@@ -745,7 +822,9 @@ async def v1_completions(tokenizer_manager, raw_request: Request):


 def v1_chat_generate_request(
-    all_requests: List[ChatCompletionRequest], tokenizer_manager
+    all_requests: List[ChatCompletionRequest],
+    tokenizer_manager,
+    request_ids: List[str] = None,
 ):
     input_ids = []
     sampling_params_list = []
@@ -802,6 +881,7 @@ def v1_chat_generate_request(
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
             }
         )
@@ -832,6 +912,7 @@ def v1_chat_generate_request(
         top_logprobs_num=top_logprobs_nums,
         stream=all_requests[0].stream,
         return_text_in_logprobs=True,
+        rid=request_ids,
     )
     if len(all_requests) == 1:
         return adapted_request, all_requests[0]
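The request-id plumbing above keys each batch entry off its `custom_id`, which is what `cancel_batch` later aborts via `tokenizer_manager.abort_request(rid=...)`. A hypothetical input line for the batch endpoint, following the OpenAI batch-input format (the model name is a placeholder):

```python
# Hypothetical .jsonl input line; process_batch() collects request_data["custom_id"]
# into request_ids and forwards them as `rid`, and cancel_batch() aborts those rids.
import json

line = {
    "custom_id": "request-1",          # becomes the rid used for cancellation
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",   # placeholder
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 16,
    },
}
print(json.dumps(line))
```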
sglang/srt/openai_api/protocol.py
CHANGED
@@ -161,6 +161,7 @@ class CompletionRequest(BaseModel):

     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     ignore_eos: Optional[bool] = False
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
@@ -262,6 +263,7 @@ class ChatCompletionRequest(BaseModel):

     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
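Because `json_schema` is an SRT-only extra field on both request models, a client can pass it straight through the OpenAI-compatible HTTP API. A minimal sketch, assuming a local sglang server on port 30000 and a placeholder model name:

```python
# Sketch of a /v1/chat/completions call using the new `json_schema` extra parameter.
import requests

schema = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}'

resp = requests.post(
    "http://127.0.0.1:30000/v1/chat/completions",
    json={
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder
        "messages": [{"role": "user", "content": "Reply as a JSON object."}],
        "json_schema": schema,   # SRT-only extra; plain OpenAI endpoints ignore it
        "max_tokens": 64,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```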
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -21,63 +21,10 @@ class SamplingBatchInfo:
     top_ps: torch.Tensor = None
     top_ks: torch.Tensor = None
     min_ps: torch.Tensor = None
-
-    # Dispatch in CUDA graph
-    need_min_p_sampling: bool = False
-
-    # Bias Tensors
+    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
     logit_bias: torch.Tensor = None
     vocab_mask: torch.Tensor = None

-    # Penalizer
-    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
-    linear_penalties: torch.Tensor = None
-    scaling_penalties: torch.Tensor = None
-
-    def has_bias(self):
-        return (
-            self.logit_bias is not None
-            or self.vocab_mask is not None
-            or self.linear_penalties is not None
-            or self.scaling_penalties is not None
-        )
-
-    @classmethod
-    def dummy_one(cls, max_bs: int, vocab_size: int):
-        ret = cls(vocab_size=vocab_size)
-        ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
-        ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
-        ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
-        ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
-        return ret
-
-    def __getitem__(self, key):
-        if isinstance(key, slice):
-            # NOTE: We do not use cuda graph when there is bias tensors
-            assert not self.has_bias()
-            return SamplingBatchInfo(
-                vocab_size=self.vocab_size,
-                temperatures=self.temperatures[key],
-                top_ps=self.top_ps[key],
-                top_ks=self.top_ks[key],
-                min_ps=self.min_ps[key],
-                need_min_p_sampling=self.need_min_p_sampling,
-            )
-        else:
-            raise NotImplementedError
-
-    def inplace_assign(self, bs: int, other: SamplingBatchInfo):
-        # NOTE: We do not use cuda graph when there is bias tensors
-        assert not self.has_bias()
-
-        self.vocab_size = other.vocab_size
-        self.need_min_p_sampling = other.need_min_p_sampling
-
-        self.temperatures[:bs] = other.temperatures
-        self.top_ps[:bs] = other.top_ps
-        self.top_ks[:bs] = other.top_ks
-        self.min_ps[:bs] = other.min_ps
-
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         device = "cuda"
@@ -98,7 +45,6 @@ class SamplingBatchInfo:
         ret.min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
         )
-        ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)

         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -126,25 +72,6 @@ class SamplingBatchInfo:

         return ret

-    def prepare_penalties(self):
-        self.scaling_penalties = None
-        self.linear_penalties = None
-
-        for penalizer in self.penalizer_orchestrator.penalizers.values():
-            if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
-                if penalizer.is_prepared():
-                    self.scaling_penalties = penalizer.cumulated_repetition_penalties
-            else:
-                if penalizer.is_prepared():
-                    if self.linear_penalties is None:
-                        bs = self.penalizer_orchestrator.batch.batch_size()
-                        self.linear_penalties = torch.zeros(
-                            (bs, self.vocab_size),
-                            dtype=torch.float32,
-                            device="cuda",
-                        )
-                    self.linear_penalties = penalizer.apply(self.linear_penalties)
-
     def update_regex_vocab_mask(self, batch: ScheduleBatch):
         bs, reqs = batch.batch_size(), batch.reqs
         device = "cuda"
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -39,6 +39,7 @@ class SamplingParams:
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,
+        json_schema: Optional[str] = None,
     ) -> None:
         self.temperature = temperature
         self.top_p = top_p
@@ -56,6 +57,7 @@ class SamplingParams:
         self.spaces_between_special_tokens = spaces_between_special_tokens
         self.regex = regex
         self.n = n
+        self.json_schema = json_schema

         # Process some special cases
         if self.temperature < _SAMPLING_EPS:
@@ -106,6 +108,8 @@ class SamplingParams:
                 f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
+        if self.regex is not None and self.json_schema is not None:
+            raise ValueError("regex and json_schema cannot be both set.")

     def normalize(self, tokenizer):
         # Process stop strings
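On the engine side, the new keyword ends up on `SamplingParams`, and the added check makes `regex` and `json_schema` mutually exclusive. A minimal sketch, assuming the keyword defaults shown in the diff above:

```python
from sglang.srt.sampling.sampling_params import SamplingParams

schema = '{"type": "object", "properties": {"name": {"type": "string"}}, "required": ["name"]}'

# New in 0.2.14.post1: a JSON schema can be attached to the sampling params.
params = SamplingParams(temperature=0.0, json_schema=schema)

# Setting both `regex` and `json_schema` on the same request is rejected by the
# validation added above ("regex and json_schema cannot be both set.").
```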
sglang/srt/server.py
CHANGED
@@ -59,6 +59,7 @@ from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
     v1_batches,
+    v1_cancel_batch,
     v1_chat_completions,
     v1_completions,
     v1_delete_file,
@@ -246,6 +247,12 @@ async def openai_v1_batches(raw_request: Request):
     return await v1_batches(tokenizer_manager, raw_request)


+@app.post("/v1/batches/{batch_id}/cancel")
+async def cancel_batches(batch_id: str):
+    # https://platform.openai.com/docs/api-reference/batch/cancel
+    return await v1_cancel_batch(tokenizer_manager, batch_id)
+
+
 @app.get("/v1/batches/{batch_id}")
 async def retrieve_batch(batch_id: str):
     return await v1_retrieve_batch(batch_id)
@@ -414,7 +421,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.6",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
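A hedged sketch of exercising the new cancel route, assuming a local sglang server on port 30000 and a batch id obtained from an earlier `POST /v1/batches` call:

```python
# Cancel a running batch via the new OpenAI-compatible route, then poll its status.
import requests

base = "http://127.0.0.1:30000"
batch_id = "batch_abc123"  # placeholder; use the id returned when the batch was created

cancel = requests.post(f"{base}/v1/batches/{batch_id}/cancel", timeout=30)
print(cancel.json().get("status"))    # expected: "cancelling"

status = requests.get(f"{base}/v1/batches/{batch_id}", timeout=30)
print(status.json().get("status"))    # eventually: "cancelled"
```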
sglang/test/runners.py
CHANGED
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.14"
+__version__ = "0.2.14.post1"
{sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.14
+Version: 0.2.14.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -339,6 +339,7 @@ docker run --gpus all \
 ### Method 4: Using docker compose

 <details>
+<summary>More</summary>

 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
@@ -350,6 +351,7 @@ docker run --gpus all \
 ### Method 5: Run on Kubernetes or Clouds with SkyPilot

 <details>
+<summary>More</summary>

 To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).

@@ -389,7 +391,7 @@ sky status --endpoint 30000 sglang


 ### Common Notes
-- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang.
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Backend: SGLang Runtime (SRT)
@@ -518,6 +520,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec

 #### Use Models From ModelScope
 <details>
+<summary>More</summary>

 To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
@@ -532,6 +535,7 @@ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen

 #### Run Llama 3.1 405B
 <details>
+<summary>More</summary>

 ```bash
 # Run 405B (fp8) on a single node
@@ -549,7 +553,9 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/

 ### Benchmark Performance

-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+  Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+  A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```