sglang 0.2.13__py3-none-any.whl → 0.2.14.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +6 -0
- sglang/bench_latency.py +7 -3
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +9 -0
- sglang/launch_server.py +8 -1
- sglang/srt/constrained/fsm_cache.py +11 -2
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +100 -1
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/fused_moe/layer.py +2 -2
- sglang/srt/layers/logits_processor.py +56 -19
- sglang/srt/layers/radix_attention.py +3 -4
- sglang/srt/layers/sampler.py +101 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +46 -166
- sglang/srt/managers/tokenizer_manager.py +192 -83
- sglang/srt/managers/tp_worker.py +118 -24
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +32 -8
- sglang/srt/model_executor/forward_batch_info.py +51 -26
- sglang/srt/model_executor/model_runner.py +201 -58
- sglang/srt/models/gemma2.py +10 -6
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +11 -1
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/qwen2.py +9 -3
- sglang/srt/openai_api/adapter.py +200 -39
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/sampling_batch_info.py +136 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +22 -0
- sglang/srt/server.py +92 -57
- sglang/srt/server_args.py +43 -15
- sglang/srt/utils.py +26 -16
- sglang/test/runners.py +22 -30
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_utils.py +36 -53
- sglang/version.py +1 -1
- {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/METADATA +100 -27
- sglang-0.2.14.post1.dist-info/RECORD +114 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang-0.2.13.dist-info/RECORD +0 -112
- {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/LICENSE +0 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
```diff
@@ -17,6 +17,7 @@ limitations under the License.
 
 import asyncio
 import json
+import logging
 import os
 import time
 import uuid
@@ -64,6 +65,8 @@ from sglang.srt.openai_api.protocol import (
     UsageInfo,
 )
 
+logger = logging.getLogger(__name__)
+
 chat_template_name = None
 
 
@@ -120,7 +123,7 @@ def create_streaming_error_response(
 def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
     global chat_template_name
 
-
+    logger.info(f"Use chat template: {chat_template_arg}")
     if not chat_template_exists(chat_template_arg):
         if not os.path.exists(chat_template_arg):
             raise RuntimeError(
@@ -272,20 +275,32 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
         end_point = batch_storage[batch_id].endpoint
         file_request_list = []
         all_requests = []
+        request_ids = []
         for line in lines:
             request_data = json.loads(line)
             file_request_list.append(request_data)
             body = request_data["body"]
+            request_ids.append(request_data["custom_id"])
+
+            # Although streaming is supported for standalone completions, it is not supported in
+            # batch mode (multiple completions in single request).
+            if body.get("stream", False):
+                raise ValueError("Streaming requests are not supported in batch mode")
+
             if end_point == "/v1/chat/completions":
                 all_requests.append(ChatCompletionRequest(**body))
             elif end_point == "/v1/completions":
                 all_requests.append(CompletionRequest(**body))
+
         if end_point == "/v1/chat/completions":
             adapted_request, request = v1_chat_generate_request(
-                all_requests, tokenizer_manager
+                all_requests, tokenizer_manager, request_ids=request_ids
             )
         elif end_point == "/v1/completions":
-            adapted_request, request = v1_generate_request(
+            adapted_request, request = v1_generate_request(
+                all_requests, request_ids=request_ids
+            )
+
         try:
             ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
             if not isinstance(ret, list):
@@ -317,6 +332,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 }
                 all_ret.append(response_json)
                 completed_requests += 1
+
         # Write results to a new file
         output_file_id = f"backend_result_file-{uuid.uuid4()}"
         global storage_dir
@@ -346,7 +362,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
         }
 
     except Exception as e:
-
+        logger.error("error in SGLang:", e)
         # Update batch status to "failed"
         retrieve_batch = batch_storage[batch_id]
         retrieve_batch.status = "failed"
@@ -363,6 +379,72 @@ async def v1_retrieve_batch(batch_id: str):
     return batch_response
 
 
+async def v1_cancel_batch(tokenizer_manager, batch_id: str):
+    # Retrieve the batch job from the in-memory storage
+    batch_response = batch_storage.get(batch_id)
+    if batch_response is None:
+        raise HTTPException(status_code=404, detail="Batch not found")
+
+    # Only do cancal when status is "validating" or "in_progress"
+    if batch_response.status in ["validating", "in_progress"]:
+        # Start cancelling the batch asynchronously
+        asyncio.create_task(
+            cancel_batch(
+                tokenizer_manager=tokenizer_manager,
+                batch_id=batch_id,
+                input_file_id=batch_response.input_file_id,
+            )
+        )
+
+        # Update batch status to "cancelling"
+        batch_response.status = "cancelling"
+
+        return batch_response
+    else:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Current status is {batch_response.status}, no need to cancel",
+        )
+
+
+async def cancel_batch(tokenizer_manager, batch_id: str, input_file_id: str):
+    try:
+        # Update the batch status to "cancelling"
+        batch_storage[batch_id].status = "cancelling"
+
+        # Retrieve the input file content
+        input_file_request = file_id_request.get(input_file_id)
+        if not input_file_request:
+            raise ValueError("Input file not found")
+
+        # Parse the JSONL file and process each request
+        input_file_path = file_id_storage.get(input_file_id)
+        with open(input_file_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        file_request_list = []
+        request_ids = []
+        for line in lines:
+            request_data = json.loads(line)
+            file_request_list.append(request_data)
+            request_ids.append(request_data["custom_id"])
+
+        # Cancel requests by request_ids
+        for rid in request_ids:
+            tokenizer_manager.abort_request(rid=rid)
+
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "cancelled"
+
+    except Exception as e:
+        logger.error("error in SGLang:", e)
+        # Update batch status to "failed"
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "failed"
+        retrieve_batch.failed_at = int(time.time())
+        retrieve_batch.errors = {"message": str(e)}
+
+
 async def v1_retrieve_file(file_id: str):
     # Retrieve the batch job from the in-memory storage
     file_response = file_id_response.get(file_id)
@@ -383,20 +465,35 @@ async def v1_retrieve_file_content(file_id: str):
     return StreamingResponse(iter_file(), media_type="application/octet-stream")
 
 
-def v1_generate_request(
+def v1_generate_request(
+    all_requests: List[CompletionRequest], request_ids: List[str] = None
+):
     prompts = []
     sampling_params_list = []
     return_logprobs = []
+    logprob_start_lens = []
     top_logprobs_nums = []
-    first_prompt_type = type(all_requests[0].prompt)
 
+    # NOTE: with openai API, the prompt's logprobs are always not computed
+    first_prompt_type = type(all_requests[0].prompt)
    for request in all_requests:
-        prompt = request.prompt
         assert (
-            type(prompt) == first_prompt_type
+            type(request.prompt) == first_prompt_type
         ), "All prompts must be of the same type in file input settings"
-
+        if len(all_requests) > 1 and request.n > 1:
+            raise ValueError(
+                "Parallel sampling is not supported for completions from files"
+            )
+        if request.echo and request.logprobs:
+            logger.warning(
+                "Echo is not compatible with logprobs. "
+                "To compute logprobs of input prompt, please use SGLang /request API."
+            )
+
+    for request in all_requests:
+        prompts.append(request.prompt)
         return_logprobs.append(request.logprobs is not None and request.logprobs > 0)
+        logprob_start_lens.append(-1)
         top_logprobs_nums.append(
             request.logprobs if request.logprobs is not None else 0
         )
@@ -412,18 +509,16 @@ def v1_generate_request(all_requests):
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
                 "ignore_eos": request.ignore_eos,
             }
         )
-        if len(all_requests) > 1 and request.n > 1:
-            raise ValueError(
-                "Parallel sampling is not supported for completions from files"
-            )
 
     if len(all_requests) == 1:
         prompt = prompts[0]
         sampling_params_list = sampling_params_list[0]
+        logprob_start_lens = logprob_start_lens[0]
         return_logprobs = return_logprobs[0]
         top_logprobs_nums = top_logprobs_nums[0]
         if isinstance(prompt, str) or isinstance(prompt[0], str):
@@ -441,8 +536,10 @@ def v1_generate_request(all_requests):
         sampling_params=sampling_params_list,
         return_logprob=return_logprobs,
         top_logprobs_num=top_logprobs_nums,
+        logprob_start_len=logprob_start_lens,
         return_text_in_logprobs=True,
         stream=all_requests[0].stream,
+        rid=request_ids,
     )
 
     if len(all_requests) == 1:
@@ -580,27 +677,45 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     if adapted_request.stream:
 
         async def generate_stream_resp():
-
-
+            stream_buffers = {}
+            n_prev_tokens = {}
+            prompt_tokens = {}
+            completion_tokens = {}
             try:
                 async for content in tokenizer_manager.generate_request(
                     adapted_request, raw_request
                 ):
+                    index = content["index"]
+
+                    stream_buffer = stream_buffers.get(index, "")
+                    n_prev_token = n_prev_tokens.get(index, 0)
+
                     text = content["text"]
-                    prompt_tokens = content["meta_info"]["prompt_tokens"]
-                    completion_tokens = content["meta_info"]["completion_tokens"]
+                    prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
+                    completion_tokens[index] = content["meta_info"]["completion_tokens"]
 
                     if not stream_buffer:  # The first chunk
                         if request.echo:
                             if isinstance(request.prompt, str):
                                 # for the case of single str prompts
                                 prompts = request.prompt
-                            elif isinstance(request.prompt, list)
-                                request.prompt[0],
-
-
-
-
+                            elif isinstance(request.prompt, list):
+                                if isinstance(request.prompt[0], str):
+                                    # for the case of multiple str prompts
+                                    prompts = request.prompt[index // request.n]
+                                elif isinstance(request.prompt[0], int):
+                                    # for the case of single token ids prompt
+                                    prompts = tokenizer_manager.tokenizer.decode(
+                                        request.prompt, skip_special_tokens=True
+                                    )
+                                elif isinstance(request.prompt[0], list) and isinstance(
+                                    request.prompt[0][0], int
+                                ):
+                                    # for the case of multiple token ids prompts
+                                    prompts = tokenizer_manager.tokenizer.decode(
+                                        request.prompt[index // request.n],
+                                        skip_special_tokens=True,
+                                    )
 
                             # Prepend prompt in response text.
                             text = prompts + text
@@ -637,7 +752,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                     delta = text[len(stream_buffer) :]
                     stream_buffer = stream_buffer + delta
                     choice_data = CompletionResponseStreamChoice(
-                        index=
+                        index=index,
                         text=delta,
                         logprobs=logprobs,
                         finish_reason=format_finish_reason(
@@ -650,12 +765,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                         choices=[choice_data],
                         model=request.model,
                     )
+
+                    stream_buffers[index] = stream_buffer
+                    n_prev_tokens[index] = n_prev_token
+
                    yield f"data: {chunk.model_dump_json()}\n\n"
                 if request.stream_options and request.stream_options.include_usage:
+                    total_prompt_tokens = sum(
+                        tokens
+                        for i, tokens in prompt_tokens.items()
+                        if i % request.n == 0
+                    )
+                    total_completion_tokens = sum(
+                        tokens for tokens in completion_tokens.values()
+                    )
                     usage = UsageInfo(
-                        prompt_tokens=
-                        completion_tokens=
-                        total_tokens=
+                        prompt_tokens=total_prompt_tokens,
+                        completion_tokens=total_completion_tokens,
+                        total_tokens=total_prompt_tokens + total_completion_tokens,
                     )
 
                     final_usage_chunk = CompletionStreamResponse(
@@ -694,12 +821,20 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     return response
 
 
-def v1_chat_generate_request(
+def v1_chat_generate_request(
+    all_requests: List[ChatCompletionRequest],
+    tokenizer_manager,
+    request_ids: List[str] = None,
+):
     input_ids = []
     sampling_params_list = []
     image_data_list = []
     return_logprobs = []
+    logprob_start_lens = []
     top_logprobs_nums = []
+
+    # NOTE: with openai API, the prompt's logprobs are always not computed
+
     for request in all_requests:
         # Prep the data needed for the underlying GenerateReqInput:
         #  - prompt: The full prompt string.
@@ -732,6 +867,7 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
             image_data = None
         input_ids.append(prompt_ids)
         return_logprobs.append(request.logprobs)
+        logprob_start_lens.append(-1)
         top_logprobs_nums.append(request.top_logprobs)
         sampling_params_list.append(
             {
@@ -745,6 +881,7 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
             }
         )
@@ -758,20 +895,24 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
         sampling_params_list = sampling_params_list[0]
         image_data = image_data_list[0]
         return_logprobs = return_logprobs[0]
+        logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
     else:
         if isinstance(input_ids[0], str):
             prompt_kwargs = {"text": input_ids}
         else:
             prompt_kwargs = {"input_ids": input_ids}
+
     adapted_request = GenerateReqInput(
         **prompt_kwargs,
         image_data=image_data,
         sampling_params=sampling_params_list,
         return_logprob=return_logprobs,
+        logprob_start_len=logprob_start_lens,
         top_logprobs_num=top_logprobs_nums,
         stream=all_requests[0].stream,
         return_text_in_logprobs=True,
+        rid=request_ids,
     )
     if len(all_requests) == 1:
         return adapted_request, all_requests[0]
@@ -892,16 +1033,23 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     if adapted_request.stream:
 
         async def generate_stream_resp():
-
-
-
-
+            is_firsts = {}
+            stream_buffers = {}
+            n_prev_tokens = {}
+            prompt_tokens = {}
+            completion_tokens = {}
             try:
                 async for content in tokenizer_manager.generate_request(
                     adapted_request, raw_request
                 ):
-
-
+                    index = content["index"]
+
+                    is_first = is_firsts.get(index, True)
+                    stream_buffer = stream_buffers.get(index, "")
+                    n_prev_token = n_prev_tokens.get(index, 0)
+
+                    prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
+                    completion_tokens[index] = content["meta_info"]["completion_tokens"]
                     if request.logprobs:
                         logprobs = to_openai_style_logprobs(
                             output_token_logprobs=content["meta_info"][
@@ -951,7 +1099,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         # First chunk with role
                         is_first = False
                         choice_data = ChatCompletionResponseStreamChoice(
-                            index=
+                            index=index,
                             delta=DeltaMessage(role="assistant"),
                             finish_reason=format_finish_reason(
                                 content["meta_info"]["finish_reason"]
@@ -969,7 +1117,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     delta = text[len(stream_buffer) :]
                     stream_buffer = stream_buffer + delta
                     choice_data = ChatCompletionResponseStreamChoice(
-                        index=
+                        index=index,
                         delta=DeltaMessage(content=delta),
                         finish_reason=format_finish_reason(
                             content["meta_info"]["finish_reason"]
@@ -981,12 +1129,25 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         choices=[choice_data],
                         model=request.model,
                     )
+
+                    is_firsts[index] = is_first
+                    stream_buffers[index] = stream_buffer
+                    n_prev_tokens[index] = n_prev_token
+
                    yield f"data: {chunk.model_dump_json()}\n\n"
                 if request.stream_options and request.stream_options.include_usage:
+                    total_prompt_tokens = sum(
+                        tokens
+                        for i, tokens in prompt_tokens.items()
+                        if i % request.n == 0
+                    )
+                    total_completion_tokens = sum(
+                        tokens for tokens in completion_tokens.values()
+                    )
                     usage = UsageInfo(
-                        prompt_tokens=
-                        completion_tokens=
-                        total_tokens=
+                        prompt_tokens=total_prompt_tokens,
+                        completion_tokens=total_completion_tokens,
+                        total_tokens=total_prompt_tokens + total_completion_tokens,
                     )
 
                     final_usage_chunk = ChatCompletionStreamResponse(
```
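The batch-processing changes above key each sub-request off the `custom_id` field of the uploaded JSONL file (it becomes the `rid` that `v1_cancel_batch` later aborts) and reject streaming bodies. Below is a hedged sketch of what one line of such an input file might look like; the field values, and the `method`/`url` keys borrowed from the OpenAI batch file format, are illustrative assumptions rather than content of this diff.

```python
import json

# Illustrative only: one line of a batch input JSONL file for the batch flow above.
batch_line = {
    "custom_id": "request-1",       # assumed caller-chosen id; used as the request rid
    "method": "POST",               # assumed, following the OpenAI batch file convention
    "url": "/v1/chat/completions",  # must match the batch endpoint handled by process_batch()
    "body": {
        "model": "my-model",        # placeholder model name
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 32,
        "stream": False,            # streaming is rejected in batch mode
    },
}
print(json.dumps(batch_line))
```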
sglang/srt/openai_api/protocol.py
CHANGED
```diff
@@ -161,6 +161,7 @@ class CompletionRequest(BaseModel):
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     ignore_eos: Optional[bool] = False
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
@@ -262,6 +263,7 @@ class ChatCompletionRequest(BaseModel):
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
```
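With the new `json_schema` field exposed on both request models, a client can pass a JSON schema string as an SRT-only extra parameter (mutually exclusive with `regex`, per the sampling_params change further down). A hedged usage sketch follows; the host, port, and model name are assumptions and not part of this diff.

```python
import json
import requests

# Illustrative only: exercising the new `json_schema` extra parameter on the
# OpenAI-compatible chat endpoint of a locally running sglang server (assumed URL).
schema = json.dumps({
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
})
resp = requests.post(
    "http://localhost:30000/v1/chat/completions",  # assumed local server address
    json={
        "model": "default",  # placeholder
        "messages": [{"role": "user", "content": "Describe a person as JSON."}],
        "json_schema": schema,  # SRT-only extra parameter added in this release
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```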
sglang/srt/sampling/sampling_batch_info.py
ADDED
```diff
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import dataclasses
+from typing import TYPE_CHECKING, List
+
+import torch
+
+import sglang.srt.sampling.penaltylib as penaltylib
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+
+
+@dataclasses.dataclass
+class SamplingBatchInfo:
+    # Basic Info
+    vocab_size: int
+
+    # Batched sampling params
+    temperatures: torch.Tensor = None
+    top_ps: torch.Tensor = None
+    top_ks: torch.Tensor = None
+    min_ps: torch.Tensor = None
+    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+    logit_bias: torch.Tensor = None
+    vocab_mask: torch.Tensor = None
+
+    @classmethod
+    def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+        device = "cuda"
+        reqs = batch.reqs
+        ret = cls(vocab_size=vocab_size)
+
+        ret.temperatures = torch.tensor(
+            [r.sampling_params.temperature for r in reqs],
+            dtype=torch.float,
+            device=device,
+        ).view(-1, 1)
+        ret.top_ps = torch.tensor(
+            [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device
+        )
+        ret.top_ks = torch.tensor(
+            [r.sampling_params.top_k for r in reqs], dtype=torch.int, device=device
+        )
+        ret.min_ps = torch.tensor(
+            [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
+        )
+
+        # Each penalizers will do nothing if they evaluate themselves as not required by looking at
+        # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
+        # should not add hefty computation overhead other than simple checks.
+        #
+        # While we choose not to even create the class instances if they are not required, this
+        # could add additional complexity to the {ScheduleBatch} class, especially we need to
+        # handle {filter_batch()} and {merge()} cases as well.
+        ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
+            vocab_size=vocab_size,
+            batch=batch,
+            device=device,
+            Penalizers={
+                penaltylib.BatchedFrequencyPenalizer,
+                penaltylib.BatchedMinNewTokensPenalizer,
+                penaltylib.BatchedPresencePenalizer,
+                penaltylib.BatchedRepetitionPenalizer,
+            },
+        )
+
+        # Handle logit bias but only allocate when needed
+        ret.logit_bias = None
+
+        ret.update_regex_vocab_mask(batch)
+
+        return ret
+
+    def update_regex_vocab_mask(self, batch: ScheduleBatch):
+        bs, reqs = batch.batch_size(), batch.reqs
+        device = "cuda"
+        has_regex = any(req.regex_fsm is not None for req in reqs)
+
+        # Reset the vocab mask
+        self.vocab_mask = None
+
+        if has_regex:
+            for i, req in enumerate(reqs):
+                if req.regex_fsm is not None:
+                    if self.vocab_mask is None:
+                        self.vocab_mask = torch.zeros(
+                            bs, self.vocab_size, dtype=torch.bool, device=device
+                        )
+                    self.vocab_mask[i][
+                        req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
+                    ] = 1
+
+    def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor):
+        self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
+
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+            "logit_bias",
+        ]:
+            self_val = getattr(self, item, None)
+            if self_val is not None:  # logit_bias can be None
+                setattr(self, item, self_val[new_indices])
+
+    def merge(self, other: "SamplingBatchInfo"):
+        self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
+
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+        ]:
+            self_val = getattr(self, item, None)
+            other_val = getattr(other, item, None)
+            setattr(self, item, torch.concat([self_val, other_val]))
+
+        # logit_bias can be None
+        if self.logit_bias is not None or other.logit_bias is not None:
+            vocab_size = (
+                self.logit_bias.shape[1]
+                if self.logit_bias is not None
+                else other.logit_bias.shape[1]
+            )
+            if self.logit_bias is None:
+                self.logit_bias = torch.zeros(
+                    (len(self.reqs), vocab_size), dtype=torch.float32, device="cuda"
+                )
+            if other.logit_bias is None:
+                other.logit_bias = torch.zeros(
+                    (len(other.reqs), vocab_size), dtype=torch.float32, device="cuda"
+                )
+            self.logit_bias = torch.concat([self.logit_bias, other.logit_bias])
```
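The new `min_ps` tensor feeds a per-request min-p cutoff at sampling time; the sampler itself (sglang/srt/layers/sampler.py, +101 lines in this release) is not shown in this diff. The following is a hedged sketch of the standard min-p rule, using tensors shaped like the ones `from_schedule_batch` builds, and is not sglang's actual sampler code.

```python
import torch


def apply_min_p(logits: torch.Tensor, min_ps: torch.Tensor) -> torch.Tensor:
    """Standard min-p filtering sketch: per request, drop tokens whose probability
    falls below min_p * p(most likely token). Not sglang's implementation."""
    probs = torch.softmax(logits, dim=-1)               # [batch, vocab]
    top_probs = probs.max(dim=-1, keepdim=True).values  # [batch, 1]
    cutoff = min_ps.unsqueeze(-1) * top_probs           # [batch, 1], broadcast over vocab
    return logits.masked_fill(probs < cutoff, float("-inf"))


# Tiny example: request 0 uses min_p=0.0 (no-op), request 1 uses min_p=0.3,
# which masks the lowest-probability token of its row.
logits = torch.tensor([[2.0, 1.0, 0.5], [2.0, 1.0, 0.5]])
min_ps = torch.tensor([0.0, 0.3])
print(apply_min_p(logits, min_ps))
```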
sglang/srt/{sampling_params.py → sampling/sampling_params.py}
RENAMED
```diff
@@ -30,6 +30,7 @@ class SamplingParams:
         temperature: float = 1.0,
         top_p: float = 1.0,
         top_k: int = -1,
+        min_p: float = 0.0,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repetition_penalty: float = 1.0,
@@ -38,10 +39,12 @@ class SamplingParams:
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,
+        json_schema: Optional[str] = None,
     ) -> None:
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k
+        self.min_p = min_p
         self.frequency_penalty = frequency_penalty
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
@@ -54,6 +57,7 @@ class SamplingParams:
         self.spaces_between_special_tokens = spaces_between_special_tokens
         self.regex = regex
         self.n = n
+        self.json_schema = json_schema
 
         # Process some special cases
         if self.temperature < _SAMPLING_EPS:
@@ -69,6 +73,8 @@ class SamplingParams:
             )
         if not 0.0 < self.top_p <= 1.0:
             raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
+        if not 0.0 <= self.min_p <= 1.0:
+            raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
         if self.top_k < -1 or self.top_k == 0:
             raise ValueError(
                 f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."
@@ -102,6 +108,8 @@ class SamplingParams:
                 f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
+        if self.regex is not None and self.json_schema is not None:
+            raise ValueError("regex and json_schema cannot be both set.")
 
     def normalize(self, tokenizer):
         # Process stop strings
@@ -123,3 +131,17 @@ class SamplingParams:
             else:
                 stop_str_max_len = max(stop_str_max_len, len(stop_str))
         self.stop_str_max_len = stop_str_max_len
+
+    def to_srt_kwargs(self):
+        return {
+            "max_new_tokens": self.max_new_tokens,
+            "stop": self.stop_strs,
+            "stop_token_ids": list(self.stop_token_ids),
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "top_k": self.top_k,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "ignore_eos": self.ignore_eos,
+            "regex": self.regex,
+        }
```
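A hedged usage sketch of the reworked class under its new module path (the path is confirmed by the rename in the file list). Only keyword arguments visible in the hunks above are used; the remaining constructor defaults (stop strings, max_new_tokens, and so on) are assumed to be populated by the parts of `__init__` this diff does not show, so `to_srt_kwargs()` is assumed to work on a freshly constructed instance.

```python
from sglang.srt.sampling.sampling_params import SamplingParams

# Illustrative only; argument values are placeholders.
params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    min_p=0.05,                        # new in 0.2.14: must lie in [0, 1]
    json_schema='{"type": "object"}',  # new in 0.2.14: mutually exclusive with regex
)
# New helper that maps the params to the SRT backend's generate kwargs.
print(params.to_srt_kwargs())
```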