sglang 0.3.4.post1__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. sglang/srt/configs/model_config.py +25 -2
  2. sglang/srt/constrained/fsm_cache.py +10 -3
  3. sglang/srt/hf_transformers_utils.py +14 -0
  4. sglang/srt/layers/attention/flashinfer_backend.py +5 -5
  5. sglang/srt/layers/logits_processor.py +5 -5
  6. sglang/srt/layers/rotary_embedding.py +15 -48
  7. sglang/srt/layers/sampler.py +51 -39
  8. sglang/srt/managers/data_parallel_controller.py +1 -1
  9. sglang/srt/managers/detokenizer_manager.py +4 -0
  10. sglang/srt/managers/io_struct.py +10 -0
  11. sglang/srt/managers/schedule_batch.py +13 -3
  12. sglang/srt/managers/scheduler.py +8 -2
  13. sglang/srt/managers/tokenizer_manager.py +14 -0
  14. sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
  15. sglang/srt/mem_cache/memory_pool.py +10 -3
  16. sglang/srt/model_executor/cuda_graph_runner.py +29 -21
  17. sglang/srt/model_executor/forward_batch_info.py +6 -9
  18. sglang/srt/model_executor/model_runner.py +2 -2
  19. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  20. sglang/srt/sampling/sampling_params.py +5 -7
  21. sglang/srt/server.py +12 -0
  22. sglang/test/run_eval.py +2 -0
  23. sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  24. sglang/test/test_utils.py +100 -3
  25. sglang/version.py +1 -1
  26. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +13 -14
  27. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +30 -30
  28. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
  29. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
  30. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/configs/model_config.py
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

+ import logging
+ import os
  from enum import IntEnum, auto
  from typing import Optional

@@ -20,6 +22,8 @@ from transformers import PretrainedConfig

  from sglang.srt.hf_transformers_utils import get_config, get_context_length

+ logger = logging.getLogger(__name__)
+

  class AttentionArch(IntEnum):
  MLA = auto()
@@ -46,10 +50,29 @@ class ModelConfig:
  model_override_args=model_override_args,
  )
  self.hf_text_config = get_hf_text_config(self.hf_config)
+ derived_context_len = get_context_length(self.hf_text_config)
+ allow_long_context = os.environ.get(
+ "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
+ )
+
  if context_length is not None:
- self.context_len = context_length
+ if context_length > derived_context_len:
+ if allow_long_context:
+ logger.warning(
+ f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+ f"This may lead to incorrect model outputs or CUDA errors."
+ )
+ self.context_len = context_length
+ else:
+ raise ValueError(
+ f"User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+ f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config. "
+ f"To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+ )
+ else:
+ self.context_len = context_length
  else:
- self.context_len = get_context_length(self.hf_text_config)
+ self.context_len = derived_context_len

  # Unify the config keys for hf_text_config
  self.head_dim = getattr(
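
For orientation, a minimal sketch of the context-length gating added above (the helper name `pick_context_len` is hypothetical; in the package the logic lives inline in `ModelConfig.__init__`):

```python
import logging
import os

logger = logging.getLogger(__name__)


def pick_context_len(user_context_length, derived_context_len):
    """Illustrative stand-in for the gating logic in ModelConfig.__init__."""
    allow_long_context = os.environ.get(
        "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
    )
    if user_context_length is None:
        return derived_context_len
    if user_context_length <= derived_context_len:
        return user_context_length
    if allow_long_context:
        # Overriding is allowed but risky: positions beyond the derived limit
        # may produce incorrect outputs or CUDA errors.
        logger.warning(
            "context_length %d exceeds derived context_length %d",
            user_context_length,
            derived_context_len,
        )
        return user_context_length
    raise ValueError(
        "User-specified context_length exceeds the derived context_length; "
        "set SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 to override."
    )
```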

sglang/srt/constrained/fsm_cache.py
@@ -73,9 +73,16 @@ class FSMCache(BaseToolCache):
  def init_value(self, key):
  key_type, key_string = key
  if key_type == "json":
- regex = build_regex_from_schema(
- key_string, whitespace_pattern=self.constrained_json_whitespace_pattern
- )
+ try:
+ regex = build_regex_from_schema(
+ key_string,
+ whitespace_pattern=self.constrained_json_whitespace_pattern,
+ )
+ except NotImplementedError as e:
+ logger.warning(
+ f"skip invalid json schema: json_schema={key_string}, {e=}"
+ )
+ return None, key_string
  elif key_type == "regex":
  regex = key_string
  else:

sglang/srt/hf_transformers_utils.py
@@ -163,6 +163,8 @@ def get_tokenizer(
  "Using a slow tokenizer. This might cause a significant "
  "slowdown. Consider using a fast tokenizer instead."
  )
+
+ attach_additional_stop_token_ids(tokenizer)
  return tokenizer


@@ -181,4 +183,16 @@ def get_processor(
  tokenizer_revision=tokenizer_revision,
  **kwargs,
  )
+
+ attach_additional_stop_token_ids(processor.tokenizer)
  return processor
+
+
+ def attach_additional_stop_token_ids(tokenizer):
+ # Special handling for stop token <|eom_id|> generated by llama 3 tool use.
+ if "<|eom_id|>" in tokenizer.get_added_vocab():
+ tokenizer.additional_stop_token_ids = set(
+ [tokenizer.get_added_vocab()["<|eom_id|>"]]
+ )
+ else:
+ tokenizer.additional_stop_token_ids = None
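
A quick check of the new helper with a stand-in tokenizer object (`FakeTokenizer` and the token id below are illustrative; real callers pass a Hugging Face tokenizer, and `get_tokenizer`/`get_processor` now attach the attribute automatically):

```python
class FakeTokenizer:
    """Minimal stand-in exposing the only method the helper touches."""

    def get_added_vocab(self):
        return {"<|eom_id|>": 128008}  # example id for the llama 3 tool-use token


def attach_additional_stop_token_ids(tokenizer):
    # Same body as the function added in the diff above.
    if "<|eom_id|>" in tokenizer.get_added_vocab():
        tokenizer.additional_stop_token_ids = set(
            [tokenizer.get_added_vocab()["<|eom_id|>"]]
        )
    else:
        tokenizer.additional_stop_token_ids = None


tok = FakeTokenizer()
attach_additional_stop_token_ids(tok)
assert tok.additional_stop_token_ids == {128008}
```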

sglang/srt/layers/attention/flashinfer_backend.py
@@ -337,7 +337,7 @@ class FlashInferIndicesUpdaterDecode:
  def update(
  self, req_pool_indices, seq_lens, seq_lens_sum, decode_wrappers, encoder_lens
  ):
- # Keep the signature for type checking, will be initialized during runtime
+ # Keep the signature for type checking. It will be assigned during runtime.
  raise NotImplementedError()

  def update_single_wrapper(
@@ -432,8 +432,8 @@ class FlashInferIndicesUpdaterDecode:
  kv_start_idx,
  ):
  bs = len(req_pool_indices)
+ kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
  kv_indptr = kv_indptr[: bs + 1]
- kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
  kv_indices = torch.empty(
  paged_kernel_lens_sum, dtype=torch.int32, device="cuda"
  )
@@ -497,7 +497,7 @@ class FlashInferIndicesUpdaterPrefill:
  self.update = self.update_single_wrapper

  def update(self, req_pool_indices, seq_lens, prefix_lens, use_ragged, encoder_lens):
- # Keep the signature for type checking, will be initialized during runtime
+ # Keep the signature for type checking. It will be assigned during runtime.
  raise NotImplementedError()

  def update_single_wrapper(
@@ -589,8 +589,8 @@ class FlashInferIndicesUpdaterPrefill:
  use_ragged,
  ):
  bs = len(req_pool_indices)
+ kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
  kv_indptr = kv_indptr[: bs + 1]
- kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
  kv_indices = torch.empty(kv_indptr[-1], dtype=torch.int32, device="cuda")
  create_flashinfer_kv_indices_triton[(bs,)](
  self.req_to_token,
@@ -602,8 +602,8 @@ class FlashInferIndicesUpdaterPrefill:
  self.max_context_len,
  )

+ qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
  qo_indptr = qo_indptr[: bs + 1]
- qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0)

  # extend part
  if use_ragged:
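
The `kv_indptr`/`qo_indptr` changes write the cumulative sums into the preallocated buffer before narrowing the view. A tiny torch-only illustration of that construction order, with made-up lengths:

```python
import torch

paged_kernel_lens = torch.tensor([3, 5, 2], dtype=torch.int32)
bs = paged_kernel_lens.numel()

kv_indptr_buf = torch.zeros(16, dtype=torch.int32)  # preallocated buffer (> bs + 1)
kv_indptr_buf[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
kv_indptr = kv_indptr_buf[: bs + 1]

assert kv_indptr.tolist() == [0, 3, 8, 10]
```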

sglang/srt/layers/logits_processor.py
@@ -33,17 +33,17 @@ class LogitsProcessorOutput:
  # The logits of the next tokens. shape: [#seq, vocab_size]
  next_token_logits: torch.Tensor
  # The logprobs of the next tokens. shape: [#seq, vocab_size]
- next_token_logprobs: torch.Tensor
+ next_token_logprobs: torch.Tensor = None

  # The normlaized logprobs of prompts. shape: [#seq]
- normalized_prompt_logprobs: torch.Tensor
+ normalized_prompt_logprobs: torch.Tensor = None
  # The logprobs of input tokens. shape: [#token, vocab_size]
- input_token_logprobs: torch.Tensor
+ input_token_logprobs: torch.Tensor = None

  # The logprob and id of the top-k tokens in input positions. shape [#seq, #token, k] of Tuple(logprob, token_id)
- input_top_logprobs: List
+ input_top_logprobs: List = None
  # The logprob and id of the top-k tokens in output positions. shape [#seq, #token, k] of Tuple(logprob, token_id)
- output_top_logprobs: List
+ output_top_logprobs: List = None


  @dataclasses.dataclass
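
With the new `None` defaults, decode paths that skip logprob computation can build the output from the next-token logits alone. A sketch using a local copy of the dataclass (shapes are arbitrary):

```python
import dataclasses
from typing import List

import torch


@dataclasses.dataclass
class LogitsProcessorOutput:  # local copy mirroring the fields in the diff
    next_token_logits: torch.Tensor
    next_token_logprobs: torch.Tensor = None
    normalized_prompt_logprobs: torch.Tensor = None
    input_token_logprobs: torch.Tensor = None
    input_top_logprobs: List = None
    output_top_logprobs: List = None


out = LogitsProcessorOutput(next_token_logits=torch.randn(2, 32000))
assert out.next_token_logprobs is None  # filled later only if logprobs are requested
```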

sglang/srt/layers/rotary_embedding.py
@@ -22,64 +22,33 @@ class MRotaryEmbedding:

  @staticmethod
  def get_input_positions(
- input_tokens: List[int],
+ input_tokens: torch.Tensor,
  image_grid_thw: Union[List[List[int]], torch.Tensor],
- video_grid_thw: Union[List[List[int]], torch.Tensor],
- image_token_id: int,
- video_token_id: int,
  vision_start_token_id: int,
- vision_end_token_id: int,
  spatial_merge_size: int,
  context_len: int = 0,
- extend_prefix_len: int = 0,
  ) -> Tuple[List[List[int]], int]:
  """Get mrope input positions and delta value."""

  if isinstance(image_grid_thw, torch.Tensor):
  image_grid_thw = image_grid_thw.tolist()
- if isinstance(video_grid_thw, torch.Tensor):
- video_grid_thw = video_grid_thw.tolist()

- input_tokens_tensor = torch.tensor(input_tokens)
  vision_start_indices = torch.argwhere(
- input_tokens_tensor == vision_start_token_id
+ input_tokens == vision_start_token_id
  ).squeeze(1)
- vision_tokens = input_tokens_tensor[vision_start_indices + 1]
- image_nums = (vision_tokens == image_token_id).sum()
- video_nums = (vision_tokens == video_token_id).sum()
+ image_indices = vision_start_indices + 1
+ image_nums = image_indices.shape[0]
  llm_pos_ids_list: list = []

  st = 0
- remain_images, remain_videos = image_nums, video_nums
-
- image_index, video_index = 0, 0
- for _ in range(image_nums + video_nums):
- if image_token_id in input_tokens and remain_images > 0:
- ed_image = input_tokens.index(image_token_id, st)
- else:
- ed_image = len(input_tokens) + 1
- if video_token_id in input_tokens and remain_videos > 0:
- ed_video = input_tokens.index(video_token_id, st)
- else:
- ed_video = len(input_tokens) + 1
- if ed_image < ed_video:
- t, h, w = (
- image_grid_thw[image_index][0],
- image_grid_thw[image_index][1],
- image_grid_thw[image_index][2],
- )
- image_index += 1
- remain_images -= 1
- ed = ed_image
- else:
- t, h, w = (
- video_grid_thw[video_index][0],
- video_grid_thw[video_index][1],
- video_grid_thw[video_index][2],
- )
- video_index += 1
- remain_videos -= 1
- ed = ed_video
+ input_tokens_len = input_tokens.shape[0]
+ for image_index in range(image_nums):
+ ed = image_indices[image_index].item()
+ t, h, w = (
+ image_grid_thw[image_index][0],
+ image_grid_thw[image_index][1],
+ image_grid_thw[image_index][2],
+ )
  llm_grid_t, llm_grid_h, llm_grid_w = (
  t,
  h // spatial_merge_size,
@@ -115,18 +84,16 @@ class MRotaryEmbedding:
  )
  st = ed + llm_grid_t * llm_grid_h * llm_grid_w

- if st < len(input_tokens):
+ if st < input_tokens_len:
  st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
- text_len = len(input_tokens) - st
+ text_len = input_tokens_len - st
  llm_pos_ids_list.append(
  torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
  )

  llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
  llm_positions = llm_positions[:, context_len:]
- mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
- llm_positions += extend_prefix_len
-
+ mrope_position_delta = (llm_positions.max() + 1 - input_tokens_len).item()
  return llm_positions.tolist(), mrope_position_delta

  @staticmethod

sglang/srt/layers/sampler.py
@@ -1,4 +1,5 @@
  import logging
+ import os
  from typing import Union

  import torch
@@ -17,6 +18,11 @@ if is_flashinfer_available():
  top_p_renorm_prob,
  )

+
+ # Crash on warning if we are running CI tests
+ crash_on_warning = os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+
+
  logger = logging.getLogger(__name__)


@@ -33,56 +39,62 @@ class Sampler(nn.Module):
  if isinstance(logits, LogitsProcessorOutput):
  logits = logits.next_token_logits

- # Post process logits
  logits = logits.contiguous()
- logits.div_(sampling_info.temperatures)
- probs = torch.softmax(logits, dim=-1)
- logits = None
- del logits
-
- if self.use_nan_detectioin and torch.any(torch.isnan(probs)):
- logger.warning("Detected errors during sampling! NaN in the probability.")
- probs = torch.where(
- torch.isnan(probs), torch.full_like(probs, 1e-10), probs
+
+ if self.use_nan_detectioin and torch.any(torch.isnan(logits)):
+ logger.warning("Detected errors during sampling! NaN in the logits.")
+ logits = torch.where(
+ torch.isnan(logits), torch.full_like(logits, -1e5), logits
  )
+ exit(1) if crash_on_warning else None

  if sampling_info.is_all_greedy:
  # Use torch.argmax if all requests use greedy sampling
- batch_next_token_ids = torch.argmax(probs, -1)
- elif global_server_args_dict["sampling_backend"] == "flashinfer":
- max_top_k_round, batch_size = 32, probs.shape[0]
- uniform_samples = torch.rand(
- (max_top_k_round, batch_size), device=probs.device
- )
- if sampling_info.need_min_p_sampling:
- probs = top_k_renorm_prob(probs, sampling_info.top_ks)
- probs = top_p_renorm_prob(probs, sampling_info.top_ps)
- batch_next_token_ids, success = min_p_sampling_from_probs(
- probs, uniform_samples, sampling_info.min_ps
+ batch_next_token_ids = torch.argmax(logits, -1)
+ else:
+ # Post process logits
+ logits.div_(sampling_info.temperatures)
+ probs = torch.softmax(logits, dim=-1)
+ logits = None
+ del logits
+
+ if global_server_args_dict["sampling_backend"] == "flashinfer":
+ max_top_k_round, batch_size = 32, probs.shape[0]
+ uniform_samples = torch.rand(
+ (max_top_k_round, batch_size), device=probs.device
  )
- else:
- batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+ if sampling_info.need_min_p_sampling:
+ probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+ probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+ batch_next_token_ids, success = min_p_sampling_from_probs(
+ probs, uniform_samples, sampling_info.min_ps
+ )
+ else:
+ batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+ probs,
+ uniform_samples,
+ sampling_info.top_ks,
+ sampling_info.top_ps,
+ filter_apply_order="joint",
+ )
+
+ if not torch.all(success):
+ logger.warning("Detected errors during sampling!")
+ batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
+ elif global_server_args_dict["sampling_backend"] == "pytorch":
+ # A slower fallback implementation with torch native operations.
+ batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
  probs,
- uniform_samples,
  sampling_info.top_ks,
  sampling_info.top_ps,
- filter_apply_order="joint",
+ sampling_info.min_ps,
+ )
+ else:
+ raise ValueError(
+ f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
  )

- if not torch.all(success):
- logger.warning("Detected errors during sampling!")
- batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
- elif global_server_args_dict["sampling_backend"] == "pytorch":
- # Here we provide a slower fallback implementation.
- batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
- probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
- )
- else:
- raise ValueError(
- f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
- )
-
- return batch_next_token_ids
+ return batch_next_token_ids.to(torch.int32)


  def top_k_top_p_min_p_sampling_from_probs_torch(
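
A hedged sketch of the reordered control flow in the sampler: the NaN guard now runs on the raw logits, greedy batches take `argmax` before any softmax, and only the non-greedy branch builds probabilities. `torch.multinomial` stands in here for the flashinfer / top-k / top-p / min-p kernels used in the package:

```python
import torch


def sample_next_tokens(
    logits: torch.Tensor, temperatures: torch.Tensor, is_all_greedy: bool
) -> torch.Tensor:
    logits = logits.contiguous()
    # Replace NaNs in the logits (the diff uses -1e5 as the fill value).
    if torch.any(torch.isnan(logits)):
        logits = torch.where(torch.isnan(logits), torch.full_like(logits, -1e5), logits)
    if is_all_greedy:
        # Greedy requests never need the softmax.
        return torch.argmax(logits, -1).to(torch.int32)
    probs = torch.softmax(logits / temperatures, dim=-1)
    # Simplified stand-in for the backend-specific sampling kernels.
    return torch.multinomial(probs, num_samples=1).squeeze(-1).to(torch.int32)
```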

sglang/srt/managers/data_parallel_controller.py
@@ -156,7 +156,7 @@ class DataParallelController:
  else:
  # Send other control messages to all workers
  for worker in self.workers:
- worker.queue.put(recv_req)
+ worker.send_pyobj(recv_req)


  def run_data_parallel_controller_process(

sglang/srt/managers/detokenizer_manager.py
@@ -27,6 +27,7 @@ from sglang.srt.managers.io_struct import (
  BatchEmbeddingOut,
  BatchStrOut,
  BatchTokenIDOut,
+ GetMemPoolSizeReqOutput,
  UpdateWeightReqOutput,
  )
  from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
@@ -111,6 +112,9 @@ class DetokenizerManager:
  # If it is a weight update request, no detokenization is needed.
  self.send_to_tokenizer.send_pyobj(recv_obj)
  continue
+ elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
+ self.send_to_tokenizer.send_pyobj(recv_obj)
+ continue
  elif self.tokenizer is None:
  # If the tokenizer is skipped, no detokenization is needed
  self.send_to_tokenizer.send_pyobj(recv_obj)

sglang/srt/managers/io_struct.py
@@ -353,3 +353,13 @@ class AbortReq:
  class ProfileReq(Enum):
  START_PROFILE = 1
  STOP_PROFILE = 2
+
+
+ @dataclass
+ class GetMemPoolSizeReq:
+ pass
+
+
+ @dataclass
+ class GetMemPoolSizeReqOutput:
+ size: int

sglang/srt/managers/schedule_batch.py
@@ -334,15 +334,20 @@ class Req:

  last_token_id = self.output_ids[-1]

- matched_eos = last_token_id in self.sampling_params.stop_token_ids
+ matched_eos = False

+ # Check stop token ids
+ if self.sampling_params.stop_token_ids:
+ matched_eos = last_token_id in self.sampling_params.stop_token_ids
  if self.tokenizer is not None:
  matched_eos |= last_token_id == self.tokenizer.eos_token_id
-
+ if self.tokenizer.additional_stop_token_ids:
+ matched_eos |= last_token_id in self.tokenizer.additional_stop_token_ids
  if matched_eos and not self.sampling_params.ignore_eos:
  self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
  return

+ # Check stop strings
  if len(self.sampling_params.stop_strs) > 0:
  tail_str = self.tokenizer.decode(
  self.output_ids[-(self.sampling_params.stop_str_max_len + 1) :]
@@ -514,7 +519,12 @@ class ScheduleBatch:
  out_cache_loc = self.token_to_kv_pool.alloc(num_tokens)

  if out_cache_loc is None:
- logger.error("Prefill out of memory. Try to lower your batch size.")
+ phase_str = "Prefill" if self.forward_mode.is_extend() else "Decode"
+ logger.error(
+ f"{phase_str} out of memory. Try to lower your batch size.\n"
+ f"Try to allocate {num_tokens} tokens.\n"
+ f"Avaliable tokens: {self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()}\n"
+ )
  if self.tree_cache is not None:
  self.tree_cache.pretty_print()
  exit(1)

sglang/srt/managers/scheduler.py
@@ -38,6 +38,8 @@ from sglang.srt.managers.io_struct import (
  BatchEmbeddingOut,
  BatchTokenIDOut,
  FlushCacheReq,
+ GetMemPoolSizeReq,
+ GetMemPoolSizeReqOutput,
  ProfileReq,
  TokenizedEmbeddingReqInput,
  TokenizedGenerateReqInput,
@@ -69,7 +71,6 @@ from sglang.srt.utils import (
  is_generation_model,
  is_multimodal_model,
  kill_parent_process,
- pytorch_profile,
  set_random_seed,
  suppress_other_loggers,
  )
@@ -363,6 +364,10 @@ class Scheduler:
  self.start_profile()
  else:
  self.stop_profile()
+ elif isinstance(recv_req, GetMemPoolSizeReq):
+ self.send_to_detokenizer.send_pyobj(
+ GetMemPoolSizeReqOutput(self.max_total_num_tokens)
+ )
  else:
  raise ValueError(f"Invalid request: {recv_req}")

@@ -416,7 +421,7 @@ class Scheduler:
  )

  # Truncate prompts that are too long
- if len(req.origin_input_ids) >= self.max_req_input_len:
+ if len(req.origin_input_ids) > self.max_req_input_len:
  logger.warning(
  "Request length is longer than the KV cache pool size or "
  "the max context length. Truncated!!!"
@@ -828,6 +833,7 @@ class Scheduler:

  if self.enable_overlap:
  logits_output, next_token_ids = self.tp_worker.resulve_batch_result(bid)
+ next_token_logprobs = logits_output.next_token_logprobs
  else:
  # Move next_token_ids and logprobs to cpu
  if batch.return_logprob:

sglang/srt/managers/tokenizer_manager.py
@@ -46,6 +46,8 @@ from sglang.srt.managers.io_struct import (
  EmbeddingReqInput,
  FlushCacheReq,
  GenerateReqInput,
+ GetMemPoolSizeReq,
+ GetMemPoolSizeReqOutput,
  ProfileReq,
  RewardReqInput,
  TokenizedEmbeddingReqInput,
@@ -531,6 +533,15 @@ class TokenizerManager:
  req = ProfileReq.STOP_PROFILE
  self.send_to_scheduler.send_pyobj(req)

+ async def get_memory_pool_size(self):
+ if self.to_create_loop:
+ self.create_handle_loop()
+
+ req = GetMemPoolSizeReq()
+ self.send_to_scheduler.send_pyobj(req)
+ self.mem_pool_size = asyncio.Future()
+ return await self.mem_pool_size
+
  async def update_weights(
  self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
  ):
@@ -590,6 +601,9 @@ class TokenizerManager:
  if isinstance(recv_obj, UpdateWeightReqOutput):
  self.model_update_result.set_result(recv_obj)
  continue
+ elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
+ self.mem_pool_size.set_result(recv_obj)
+ continue

  assert isinstance(
  recv_obj, (BatchStrOut, BatchEmbeddingOut, BatchTokenIDOut)
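
Sketch of how a caller awaits the new request/response pair, assuming an already-initialized `TokenizerManager` (the reply is a `GetMemPoolSizeReqOutput`, whose `size` the scheduler fills with `max_total_num_tokens`):

```python
async def report_mem_pool_size(tokenizer_manager):
    # Sends GetMemPoolSizeReq to the scheduler and awaits the reply that the
    # detokenizer forwards back to the tokenizer manager.
    out = await tokenizer_manager.get_memory_pool_size()
    print(f"KV cache pool size: {out.size} tokens")
```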

sglang/srt/managers/tp_worker_overlap_thread.py
@@ -32,6 +32,15 @@ from sglang.srt.server_args import ServerArgs
  logger = logging.getLogger(__name__)


+ @torch.compile(dynamic=True)
+ def resolve_future_token_ids(input_ids, future_token_ids_map):
+ input_ids[:] = torch.where(
+ input_ids < 0,
+ future_token_ids_map[torch.clamp(-input_ids, min=0)],
+ input_ids,
+ )
+
+
  class TpModelWorkerClient:
  """A tensor parallel model worker."""

@@ -94,46 +103,69 @@ class TpModelWorkerClient:
  while True:
  self.has_inflight_batch = False
  model_worker_batch, future_token_ids_ct = self.input_queue.get()
+ if not model_worker_batch:
+ break
  self.has_inflight_batch = True
  self.launch_event = threading.Event()

  # Resolve future tokens in the input
  input_ids = model_worker_batch.input_ids
- input_ids[:] = torch.where(
- input_ids < 0,
- self.future_token_ids_map[torch.clamp(-input_ids, min=0)],
- input_ids,
- )
+ resolve_future_token_ids(input_ids, self.future_token_ids_map)

  # Run forward
  logits_output, next_token_ids = self.worker.forward_batch_generation(
  model_worker_batch
  )
- self.launch_event.set()

  # Update the future token ids map
  bs = len(model_worker_batch.seq_lens)
- future_next_token_ids = torch.arange(
- -(future_token_ids_ct + bs),
- -(future_token_ids_ct),
- dtype=torch.int32,
- device=self.device,
- )
- self.future_token_ids_map[-future_next_token_ids] = next_token_ids.to(
- torch.int32
- )
-
+ self.future_token_ids_map[
+ future_token_ids_ct + 1 : future_token_ids_ct + bs + 1
+ ] = next_token_ids
+
+ # Copy results to the CPU
+ if model_worker_batch.return_logprob:
+ logits_output.next_token_logprobs = logits_output.next_token_logprobs[
+ torch.arange(len(next_token_ids), device=self.device),
+ next_token_ids,
+ ].to("cpu", non_blocking=True)
+ if logits_output.input_token_logprobs is not None:
+ logits_output.input_token_logprobs = (
+ logits_output.input_token_logprobs.to("cpu", non_blocking=True)
+ )
+ logits_output.normalized_prompt_logprobs = (
+ logits_output.normalized_prompt_logprobs.to(
+ "cpu", non_blocking=True
+ )
+ )
  next_token_ids = next_token_ids.to("cpu", non_blocking=True)
  copy_event = torch.cuda.Event(blocking=True)
  copy_event.record()
- self.copy_queue.put((copy_event, next_token_ids))
+
+ self.launch_event.set()
+ self.copy_queue.put((copy_event, logits_output, next_token_ids))

  def copy_thread_func(self):
  while True:
- copy_event, next_token_ids = self.copy_queue.get()
+ copy_event, logits_output, next_token_ids = self.copy_queue.get()
+ if not copy_event:
+ break
  while not copy_event.query():
  time.sleep(1e-5)
- self.output_queue.put((None, next_token_ids.tolist()))
+
+ if logits_output.next_token_logprobs is not None:
+ logits_output.next_token_logprobs = (
+ logits_output.next_token_logprobs.tolist()
+ )
+ if logits_output.input_token_logprobs is not None:
+ logits_output.input_token_logprobs = (
+ logits_output.input_token_logprobs.tolist()
+ )
+ logits_output.normalized_prompt_logprobs = (
+ logits_output.normalized_prompt_logprobs.tolist()
+ )
+
+ self.output_queue.put((logits_output, next_token_ids.tolist()))

  def resulve_batch_result(self, bid: int):
  logits_output, next_token_ids = self.output_queue.get()
@@ -149,8 +181,9 @@ class TpModelWorkerClient:
  # Allocate output future objects
  bs = len(model_worker_batch.seq_lens)
  future_next_token_ids = torch.arange(
- -(self.future_token_ids_ct + bs),
- -(self.future_token_ids_ct),
+ -(self.future_token_ids_ct + 1),
+ -(self.future_token_ids_ct + 1 + bs),
+ -1,
  dtype=torch.int32,
  device=self.device,
  )
@@ -170,3 +203,7 @@ class TpModelWorkerClient:
  recv_req.model_path, recv_req.load_format
  )
  return success, message
+
+ def __delete__(self):
+ self.input_queue.put((None, None))
+ self.copy_queue.put((None, None, None))
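
A standalone illustration of the placeholder scheme used by `resolve_future_token_ids`: not-yet-computed tokens are encoded as negative indices into `future_token_ids_map` (slot 0 is unused), while real token ids pass through unchanged. The tensors below are made up:

```python
import torch


def resolve_future_token_ids(input_ids, future_token_ids_map):
    # Same body as in the diff above, without the @torch.compile decorator.
    input_ids[:] = torch.where(
        input_ids < 0,
        future_token_ids_map[torch.clamp(-input_ids, min=0)],
        input_ids,
    )


future_map = torch.tensor([0, 111, 222, 333], dtype=torch.int32)
batch = torch.tensor([-1, 42, -3], dtype=torch.int32)  # -k resolves to future_map[k]
resolve_future_token_ids(batch, future_map)
assert batch.tolist() == [111, 42, 333]
```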