sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (131)
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/function_call_parser.py +33 -2
  14. sglang/srt/hf_transformers_utils.py +16 -1
  15. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  17. sglang/srt/layers/attention/triton_backend.py +1 -3
  18. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  21. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  22. sglang/srt/layers/attention/vision.py +43 -62
  23. sglang/srt/layers/dp_attention.py +30 -2
  24. sglang/srt/layers/elementwise.py +411 -0
  25. sglang/srt/layers/linear.py +1 -1
  26. sglang/srt/layers/logits_processor.py +1 -0
  27. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  28. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  37. sglang/srt/layers/moe/router.py +342 -0
  38. sglang/srt/layers/parameter.py +10 -0
  39. sglang/srt/layers/quantization/__init__.py +90 -68
  40. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  41. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/fp8.py +174 -106
  68. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  69. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  70. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  71. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  72. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  73. sglang/srt/layers/rotary_embedding.py +5 -3
  74. sglang/srt/layers/sampler.py +29 -35
  75. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  76. sglang/srt/lora/backend/__init__.py +9 -12
  77. sglang/srt/managers/cache_controller.py +74 -8
  78. sglang/srt/managers/data_parallel_controller.py +1 -1
  79. sglang/srt/managers/image_processor.py +37 -631
  80. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  81. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  82. sglang/srt/managers/image_processors/llava.py +152 -0
  83. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  84. sglang/srt/managers/image_processors/mlama.py +60 -0
  85. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  86. sglang/srt/managers/io_struct.py +32 -15
  87. sglang/srt/managers/multi_modality_padding.py +134 -0
  88. sglang/srt/managers/schedule_batch.py +213 -118
  89. sglang/srt/managers/schedule_policy.py +40 -8
  90. sglang/srt/managers/scheduler.py +176 -683
  91. sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
  92. sglang/srt/managers/tokenizer_manager.py +6 -6
  93. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  94. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  95. sglang/srt/mem_cache/chunk_cache.py +12 -44
  96. sglang/srt/mem_cache/hiradix_cache.py +71 -34
  97. sglang/srt/mem_cache/memory_pool.py +81 -17
  98. sglang/srt/mem_cache/paged_allocator.py +283 -0
  99. sglang/srt/mem_cache/radix_cache.py +117 -36
  100. sglang/srt/model_executor/cuda_graph_runner.py +68 -20
  101. sglang/srt/model_executor/forward_batch_info.py +23 -10
  102. sglang/srt/model_executor/model_runner.py +63 -63
  103. sglang/srt/model_loader/loader.py +2 -1
  104. sglang/srt/model_loader/weight_utils.py +1 -1
  105. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  106. sglang/srt/models/deepseek_nextn.py +23 -3
  107. sglang/srt/models/deepseek_v2.py +200 -191
  108. sglang/srt/models/grok.py +374 -119
  109. sglang/srt/models/minicpmv.py +28 -89
  110. sglang/srt/models/mllama.py +1 -1
  111. sglang/srt/models/qwen2.py +0 -1
  112. sglang/srt/models/qwen2_5_vl.py +25 -50
  113. sglang/srt/models/qwen2_vl.py +33 -49
  114. sglang/srt/openai_api/adapter.py +59 -35
  115. sglang/srt/openai_api/protocol.py +8 -1
  116. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  117. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  118. sglang/srt/server_args.py +24 -16
  119. sglang/srt/speculative/eagle_worker.py +75 -39
  120. sglang/srt/utils.py +104 -9
  121. sglang/test/runners.py +104 -10
  122. sglang/test/test_block_fp8.py +106 -16
  123. sglang/test/test_custom_ops.py +88 -0
  124. sglang/test/test_utils.py +20 -4
  125. sglang/utils.py +0 -4
  126. sglang/version.py +1 -1
  127. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
  128. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
  129. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
  130. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
  131. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
@@ -41,8 +41,6 @@ from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
41
41
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
42
42
  from sglang.srt.managers.io_struct import (
43
43
  AbortReq,
44
- BatchEmbeddingOut,
45
- BatchTokenIDOut,
46
44
  CloseSessionReqInput,
47
45
  FlushCacheReq,
48
46
  GetInternalStateReq,
@@ -74,7 +72,6 @@ from sglang.srt.managers.io_struct import (
74
72
  )
75
73
  from sglang.srt.managers.schedule_batch import (
76
74
  FINISH_ABORT,
77
- BaseFinishReason,
78
75
  ImageInputs,
79
76
  Req,
80
77
  ScheduleBatch,
@@ -85,6 +82,9 @@ from sglang.srt.managers.schedule_policy import (
85
82
  PrefillAdder,
86
83
  SchedulePolicy,
87
84
  )
85
+ from sglang.srt.managers.scheduler_output_processor_mixin import (
86
+ SchedulerOutputProcessorMixin,
87
+ )
88
88
  from sglang.srt.managers.session_controller import Session
89
89
  from sglang.srt.managers.tp_worker import TpModelWorker
90
90
  from sglang.srt.managers.tp_worker_overlap_thread import TpModelWorkerClient
@@ -93,7 +93,7 @@ from sglang.srt.mem_cache.chunk_cache import ChunkCache
93
93
  from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
94
94
  from sglang.srt.mem_cache.radix_cache import RadixCache
95
95
  from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
96
- from sglang.srt.model_executor.forward_batch_info import ForwardMode
96
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
97
97
  from sglang.srt.server_args import PortArgs, ServerArgs
98
98
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
99
99
  from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
@@ -103,6 +103,7 @@ from sglang.srt.utils import (
103
103
  crash_on_warnings,
104
104
  get_bool_env_var,
105
105
  get_zmq_socket,
106
+ kill_itself_when_parent_died,
106
107
  pyspy_dump_schedulers,
107
108
  set_gpu_proc_affinity,
108
109
  set_random_seed,
@@ -132,7 +133,7 @@ class EmbeddingBatchResult:
132
133
  bid: int
133
134
 
134
135
 
135
- class Scheduler:
136
+ class Scheduler(SchedulerOutputProcessorMixin):
136
137
  """A scheduler that manages a tensor parallel GPU worker."""
137
138
 
138
139
  def __init__(
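
The hunk above is the key structural change in this file: Scheduler now inherits from SchedulerOutputProcessorMixin, and the several-hundred-line result-processing methods deleted further down in this diff (process_batch_result_prefill, process_batch_result_decode, stream_output, and the logprob helpers) now live in the new sglang/srt/managers/scheduler_output_processor_mixin.py (+614 lines in the file list above). A minimal sketch of the pattern, with simplified stand-in names rather than the real sglang classes:

    class OutputProcessorMixin:
        # Assumes the host class provides the attributes these methods touch
        # (the real mixin relies on the scheduler's tree cache, detokenizer
        # queue, etc.).
        def process_batch_result_decode(self, batch, result):
            for req, token_id in zip(batch.reqs, result.next_token_ids):
                req.output_ids.append(token_id)

    class Scheduler(OutputProcessorMixin):
        def process_batch_result(self, batch, result):
            if batch.forward_mode.is_decode():
                # Inherited from the mixin, so scheduler.py keeps only the
                # scheduling loop itself.
                self.process_batch_result_decode(batch, result)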
@@ -159,6 +160,7 @@ class Scheduler:
159
160
  )
160
161
  self.gpu_id = gpu_id
161
162
  self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
163
+ self.page_size = server_args.page_size
162
164
 
163
165
  # Distributed rank info
164
166
  self.dp_size = server_args.dp_size
@@ -270,17 +272,18 @@ class Scheduler:
270
272
 
271
273
  # Init running status
272
274
  self.waiting_queue: List[Req] = []
273
- self.staging_reqs = {}
274
275
  # The running decoding batch for continuous batching
275
- self.running_batch: Optional[ScheduleBatch] = None
276
+ self.running_batch: ScheduleBatch = ScheduleBatch(reqs=[], batch_is_full=False)
276
277
  # The current forward batch
277
278
  self.cur_batch: Optional[ScheduleBatch] = None
278
- # The current forward batch
279
+ # The last forward batch
279
280
  self.last_batch: Optional[ScheduleBatch] = None
280
281
  self.forward_ct = 0
281
282
  self.forward_ct_decode = 0
282
283
  self.num_generated_tokens = 0
284
+ self.num_prefill_tokens = 0
283
285
  self.last_decode_stats_tic = time.time()
286
+ self.last_prefill_stats_tic = time.time()
284
287
  self.return_health_check_ct = 0
285
288
  self.current_stream = torch.get_device_module(self.device).current_stream()
286
289
  if self.device == "cpu":
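
Note how `self.running_batch` changes here from an Optional that is None when the server is idle to an always-present empty ScheduleBatch, which also absorbs the old scheduler-level `batch_is_full` flag; the later hunks therefore replace `is None` checks with `is_empty()`. A small self-contained sketch of this null-object style (MiniBatch is an illustrative stand-in, not the real ScheduleBatch):

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class MiniBatch:
        reqs: List[object] = field(default_factory=list)
        batch_is_full: bool = False

        def is_empty(self) -> bool:
            return len(self.reqs) == 0

    running_batch = MiniBatch()           # was: running_batch = None
    if not running_batch.is_empty():      # was: if running_batch is not None
        pass                              # decode path
    running_batch.batch_is_full = False   # the flag now lives on the batch itself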
@@ -308,7 +311,11 @@ class Scheduler:
308
311
  self.grammar_backend = None
309
312
 
310
313
  # Init schedule policy and new token estimation
311
- self.policy = SchedulePolicy(self.schedule_policy, self.tree_cache)
314
+ self.policy = SchedulePolicy(
315
+ self.schedule_policy,
316
+ self.tree_cache,
317
+ self.enable_hierarchical_cache,
318
+ )
312
319
  assert (
313
320
  server_args.schedule_conservativeness >= 0
314
321
  ), "Invalid schedule_conservativeness"
@@ -327,11 +334,6 @@ class Scheduler:
327
334
  ) / global_config.default_new_token_ratio_decay_steps
328
335
  self.new_token_ratio = self.init_new_token_ratio
329
336
 
330
- # Tell whether the current running batch is full so that we can skip
331
- # the check of whether to prefill new requests.
332
- # This is an optimization to reduce the overhead of the prefill check.
333
- self.batch_is_full = False
334
-
335
337
  # Init watchdog thread
336
338
  self.watchdog_timeout = server_args.watchdog_timeout
337
339
  t = threading.Thread(target=self.watchdog_thread, daemon=True)
@@ -431,11 +433,14 @@ class Scheduler:
431
433
  self.tree_cache = HiRadixCache(
432
434
  req_to_token_pool=self.req_to_token_pool,
433
435
  token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
436
+ tp_cache_group=self.tp_worker.get_tp_cpu_group(),
437
+ page_size=self.page_size,
434
438
  )
435
439
  else:
436
440
  self.tree_cache = RadixCache(
437
441
  req_to_token_pool=self.req_to_token_pool,
438
442
  token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
443
+ page_size=self.page_size,
439
444
  disable=server_args.disable_radix_cache,
440
445
  )
441
446
 
@@ -457,6 +462,7 @@ class Scheduler:
457
462
  # The largest context length (prefill + generation) of a single request
458
463
  self._largest_prefill_decode_len: int = 0
459
464
  self.last_gen_throughput: float = 0.0
465
+ self.last_input_throughput: float = 0.0
460
466
  self.step_time_dict = defaultdict(list) # Dict[batch size -> step time]
461
467
  self.spec_num_total_accepted_tokens = 0
462
468
  self.spec_num_total_forward_ct = 0
@@ -486,7 +492,7 @@ class Scheduler:
486
492
  result = self.run_batch(batch)
487
493
  self.process_batch_result(batch, result)
488
494
  else:
489
- # When the server is idle, so self-check and re-init some states
495
+ # When the server is idle, do self-check and re-init some states
490
496
  self.check_memory()
491
497
  self.new_token_ratio = self.init_new_token_ratio
492
498
 
@@ -526,7 +532,7 @@ class Scheduler:
526
532
  )
527
533
  self.process_batch_result(tmp_batch, tmp_result)
528
534
  elif batch is None:
529
- # When the server is idle, so self-check and re-init some states
535
+ # When the server is idle, do self-check and re-init some states
530
536
  self.check_memory()
531
537
  self.new_token_ratio = self.init_new_token_ratio
532
538
 
@@ -587,7 +593,7 @@ class Scheduler:
587
593
  for recv_req in recv_reqs:
588
594
  # If it is a health check generation request and there are running requests, ignore it.
589
595
  if is_health_check_generate_req(recv_req) and (
590
- self.chunked_req is not None or self.running_batch is not None
596
+ self.chunked_req is not None or not self.running_batch.is_empty()
591
597
  ):
592
598
  self.return_health_check_ct += 1
593
599
  continue
@@ -767,6 +773,30 @@ class Scheduler:
767
773
  )
768
774
  req.tokenizer = self.tokenizer
769
775
 
776
+ # Handle multimodal inputs
777
+ if recv_req.image_inputs is not None:
778
+ image_inputs = ImageInputs.from_dict(recv_req.image_inputs)
779
+ # Expand a single image token into multiple dummy tokens for receiving image embeddings
780
+ req.origin_input_ids = self.pad_input_ids_func(
781
+ req.origin_input_ids, image_inputs
782
+ )
783
+ req.extend_image_inputs(image_inputs)
784
+
785
+ if len(req.origin_input_ids) >= self.max_req_input_len:
786
+ error_msg = (
787
+ "Multimodal prompt is too long after expanding multimodal tokens. "
788
+ f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
789
+ )
790
+ logger.error(error_msg)
791
+ req.origin_input_ids = [0]
792
+ req.image_inputs = None
793
+ req.sampling_params.max_new_tokens = 0
794
+ req.finished_reason = FINISH_ABORT(
795
+ error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
796
+ )
797
+ self.waiting_queue.append(req)
798
+ return
799
+
770
800
  # Validate prompts length
771
801
  error_msg = validate_input_length(
772
802
  req,
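
The new multimodal branch above calls `pad_input_ids_func` to expand a single image placeholder token into enough dummy tokens to hold the image embeddings, then aborts the request if the expanded prompt no longer fits. A sketch of what such an expansion typically looks like (the function and parameter names here are illustrative, not sglang's actual API):

    def pad_input_ids(input_ids, image_token_id, num_image_embeddings, pad_token_id=0):
        # Replace each image placeholder with a run of dummy tokens; the image
        # embeddings are later written into these positions.
        out = []
        for tok in input_ids:
            if tok == image_token_id:
                out.extend([pad_token_id] * num_image_embeddings)
            else:
                out.append(tok)
        return out

    # e.g. a 3-token prompt with one image token and 4 image embeddings -> 6 tokens
    assert len(pad_input_ids([1, 99, 2], image_token_id=99, num_image_embeddings=4)) == 6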
@@ -787,6 +817,11 @@ class Scheduler:
787
817
  can_run_list: List[Req],
788
818
  running_bs: int,
789
819
  ):
820
+ gap_latency = time.time() - self.last_prefill_stats_tic
821
+ self.last_prefill_stats_tic = time.time()
822
+ self.last_input_throughput = self.num_prefill_tokens / gap_latency
823
+ self.num_prefill_tokens = 0
824
+
790
825
  num_used = self.max_total_num_tokens - (
791
826
  self.token_to_kv_pool_allocator.available_size()
792
827
  + self.tree_cache.evictable_size()
@@ -822,7 +857,7 @@ class Scheduler:
822
857
  self.last_decode_stats_tic = time.time()
823
858
  self.last_gen_throughput = self.num_generated_tokens / gap_latency
824
859
  self.num_generated_tokens = 0
825
- num_running_reqs = len(self.running_batch.reqs) if self.running_batch else 0
860
+ num_running_reqs = len(self.running_batch.reqs)
826
861
  num_used = self.max_total_num_tokens - (
827
862
  self.token_to_kv_pool_allocator.available_size()
828
863
  + self.tree_cache.evictable_size()
@@ -886,8 +921,10 @@ class Scheduler:
886
921
  )
887
922
  if memory_leak:
888
923
  msg = (
889
- "KV cache pool leak detected!"
924
+ "KV cache pool leak detected! "
890
925
  f"{available_size=}, {protected_size=}, {self.max_total_num_tokens=}\n"
926
+ f"{self.token_to_kv_pool_allocator.available_size()=}\n"
927
+ f"{self.tree_cache.evictable_size()=}\n"
891
928
  )
892
929
  warnings.warn(msg)
893
930
  if crash_on_warnings():
@@ -913,7 +950,7 @@ class Scheduler:
913
950
  self.token_to_kv_pool_allocator.available_size()
914
951
  + self.tree_cache.evictable_size()
915
952
  )
916
- num_running_reqs = len(self.running_batch.reqs) if self.running_batch else 0
953
+ num_running_reqs = len(self.running_batch.reqs)
917
954
  self.stats.num_running_reqs = num_running_reqs
918
955
  self.stats.num_used_tokens = num_used
919
956
  self.stats.token_usage = num_used / self.max_total_num_tokens
@@ -931,14 +968,20 @@ class Scheduler:
931
968
  self.tree_cache.cache_unfinished_req(self.chunked_req)
932
969
  # chunked request keeps its rid but will get a new req_pool_idx
933
970
  self.req_to_token_pool.free(self.chunked_req.req_pool_idx)
934
- self.batch_is_full = False
971
+ self.running_batch.batch_is_full = False
935
972
 
973
+ # Filter batch
974
+ last_bs = self.last_batch.batch_size()
936
975
  self.last_batch.filter_batch()
976
+ if self.last_batch.batch_size() < last_bs:
977
+ self.running_batch.batch_is_full = False
978
+
979
+ # Merge the new batch into the running batch
937
980
  if not self.last_batch.is_empty():
938
- if self.running_batch is None:
981
+ if self.running_batch.is_empty():
939
982
  self.running_batch = self.last_batch
940
983
  else:
941
- # merge running_batch with prefill batch
984
+ # Merge running_batch with prefill batch
942
985
  self.running_batch.merge_batch(self.last_batch)
943
986
 
944
987
  new_batch = self.get_new_batch_prefill()
@@ -947,15 +990,15 @@ class Scheduler:
947
990
  ret = new_batch
948
991
  else:
949
992
  # Run decode
950
- if self.running_batch is None:
951
- ret = None
952
- else:
993
+ if not self.running_batch.is_empty():
953
994
  self.running_batch = self.update_running_batch(self.running_batch)
954
- ret = self.running_batch
995
+ ret = self.running_batch if not self.running_batch.is_empty() else None
996
+ else:
997
+ ret = None
955
998
 
956
999
  # Handle DP attention
957
1000
  if self.server_args.enable_dp_attention:
958
- ret = self.prepare_dp_attn_batch(ret)
1001
+ ret, _ = self.prepare_dp_attn_batch(ret)
959
1002
 
960
1003
  return ret
961
1004
 
@@ -966,15 +1009,20 @@ class Scheduler:
966
1009
 
967
1010
  # Handle the cases where prefill is not allowed
968
1011
  if (
969
- self.batch_is_full or len(self.waiting_queue) == 0
1012
+ self.running_batch.batch_is_full or len(self.waiting_queue) == 0
970
1013
  ) and self.chunked_req is None:
971
1014
  return None
972
1015
 
973
- running_bs = len(self.running_batch.reqs) if self.running_batch else 0
1016
+ running_bs = len(self.running_batch.reqs)
974
1017
  if running_bs >= self.max_running_requests:
975
- self.batch_is_full = True
1018
+ self.running_batch.batch_is_full = True
976
1019
  return None
977
1020
 
1021
+ if self.enable_hierarchical_cache:
1022
+ # check for completion of hierarchical cache activities to release memory
1023
+ self.tree_cache.writing_check()
1024
+ self.tree_cache.loading_check()
1025
+
978
1026
  # Get priority queue
979
1027
  prefix_computed = self.policy.calc_priority(self.waiting_queue)
980
1028
 
@@ -989,17 +1037,13 @@ class Scheduler:
989
1037
  running_bs if self.is_mixed_chunk else 0,
990
1038
  )
991
1039
 
992
- is_chunked = self.chunked_req is not None
993
- if is_chunked:
1040
+ if self.chunked_req is not None:
994
1041
  self.chunked_req.init_next_round_input()
995
1042
  self.chunked_req = adder.add_chunked_req(self.chunked_req)
996
1043
 
997
1044
  if self.lora_paths:
998
- lora_set = (
999
- set([req.lora_path for req in self.running_batch.reqs])
1000
- if self.running_batch is not None
1001
- else set([])
1002
- )
1045
+ lora_set = set([req.lora_path for req in self.running_batch.reqs])
1046
+
1003
1047
  # Get requests from the waiting queue to a new prefill batch
1004
1048
  for req in self.waiting_queue:
1005
1049
  if (
@@ -1011,49 +1055,33 @@ class Scheduler:
1011
1055
  )
1012
1056
  > self.max_loras_per_batch
1013
1057
  ):
1014
- self.batch_is_full = True
1058
+ self.running_batch.batch_is_full = True
1015
1059
  break
1016
1060
 
1017
1061
  if running_bs + len(adder.can_run_list) >= self.max_running_requests:
1018
- self.batch_is_full = True
1062
+ self.running_batch.batch_is_full = True
1019
1063
  break
1020
1064
 
1021
- req.init_next_round_input(None if prefix_computed else self.tree_cache)
1065
+ req.init_next_round_input(
1066
+ None if prefix_computed else self.tree_cache,
1067
+ self.enable_hierarchical_cache,
1068
+ )
1022
1069
 
1023
- if self.enable_hierarchical_cache and req.last_node is not None:
1024
- if req.last_node.evicted:
1025
- # loading KV cache for the request
1026
- req.last_node, req.prefix_indices = self.tree_cache.init_load_back(
1027
- req.last_node,
1028
- req.prefix_indices,
1029
- adder.rem_total_tokens,
1030
- )
1031
- if req.last_node.loading:
1032
- # to prevent frequent cache invalidation
1033
- if req.rid in self.staging_reqs:
1034
- self.tree_cache.dec_lock_ref(self.staging_reqs[req.rid])
1035
- self.tree_cache.inc_lock_ref(req.last_node)
1036
- self.staging_reqs[req.rid] = req.last_node
1037
- continue
1038
- elif req.last_node.loading:
1039
- if not self.tree_cache.loading_complete(req.last_node):
1040
- continue
1041
-
1042
- if req.rid in self.staging_reqs:
1043
- self.tree_cache.dec_lock_ref(self.staging_reqs[req.rid])
1044
- del self.staging_reqs[req.rid]
1045
-
1046
- res = adder.add_one_req(req, self.chunked_req)
1070
+ res = adder.add_one_req(
1071
+ req, self.chunked_req, self.enable_hierarchical_cache
1072
+ )
1047
1073
  if res != AddReqResult.CONTINUE:
1048
1074
  if res == AddReqResult.NO_TOKEN:
1049
1075
  if self.enable_hierarchical_cache:
1050
1076
  # Set batch_is_full after making sure there are requests that can be served
1051
- self.batch_is_full = len(adder.can_run_list) > 0 or (
1077
+ self.running_batch.batch_is_full = len(
1078
+ adder.can_run_list
1079
+ ) > 0 or (
1052
1080
  self.running_batch is not None
1053
1081
  and not self.running_batch.is_empty()
1054
1082
  )
1055
1083
  else:
1056
- self.batch_is_full = True
1084
+ self.running_batch.batch_is_full = True
1057
1085
  break
1058
1086
 
1059
1087
  # Update waiting queue
@@ -1064,6 +1092,9 @@ class Scheduler:
1064
1092
  x for x in self.waiting_queue if x not in set(can_run_list)
1065
1093
  ]
1066
1094
 
1095
+ if self.enable_hierarchical_cache:
1096
+ self.tree_cache.read_to_load_cache()
1097
+
1067
1098
  if adder.new_chunked_req is not None:
1068
1099
  assert self.chunked_req is None
1069
1100
  self.chunked_req = adder.new_chunked_req
@@ -1091,7 +1122,7 @@ class Scheduler:
1091
1122
  # Mixed-style chunked prefill
1092
1123
  if (
1093
1124
  self.is_mixed_chunk
1094
- and self.running_batch is not None
1125
+ and not self.running_batch.is_empty()
1095
1126
  and not (new_batch.return_logprob or self.running_batch.return_logprob)
1096
1127
  ):
1097
1128
  # TODO (lianmin): support return_logprob + mixed chunked prefill
@@ -1100,7 +1131,9 @@ class Scheduler:
1100
1131
  self.running_batch.prepare_for_decode()
1101
1132
  new_batch.mix_with_running(self.running_batch)
1102
1133
  new_batch.decoding_reqs = self.running_batch.reqs
1103
- self.running_batch = None
1134
+ self.running_batch = ScheduleBatch(
1135
+ reqs=[], batch_is_full=self.running_batch.batch_is_full
1136
+ )
1104
1137
  else:
1105
1138
  new_batch.decoding_reqs = None
1106
1139
 
@@ -1112,8 +1145,8 @@ class Scheduler:
1112
1145
 
1113
1146
  batch.filter_batch()
1114
1147
  if batch.is_empty():
1115
- self.batch_is_full = False
1116
- return None
1148
+ batch.batch_is_full = False
1149
+ return batch
1117
1150
 
1118
1151
  # Check if decode out of memory
1119
1152
  if not batch.check_decode_mem(self.decode_mem_cache_buf_multiplier) or (
@@ -1137,7 +1170,7 @@ class Scheduler:
1137
1170
  )
1138
1171
 
1139
1172
  if batch.batch_size() < initial_bs:
1140
- self.batch_is_full = False
1173
+ batch.batch_is_full = False
1141
1174
 
1142
1175
  # Update batch tensors
1143
1176
  batch.prepare_for_decode()
@@ -1212,8 +1245,6 @@ class Scheduler:
1212
1245
  ):
1213
1246
  if batch.forward_mode.is_decode():
1214
1247
  self.process_batch_result_decode(batch, result)
1215
- if batch.is_empty():
1216
- self.running_batch = None
1217
1248
  elif batch.forward_mode.is_extend():
1218
1249
  self.process_batch_result_prefill(batch, result)
1219
1250
  elif batch.forward_mode.is_idle():
@@ -1235,615 +1266,76 @@ class Scheduler:
1235
1266
  self.return_health_check_ct -= 1
1236
1267
  self.send_to_tokenizer.send_pyobj(HealthCheckOutput())
1237
1268
 
1238
- def process_batch_result_prefill(
1239
- self,
1240
- batch: ScheduleBatch,
1241
- result: Union[GenerationBatchResult, EmbeddingBatchResult],
1242
- ):
1243
- skip_stream_req = None
1244
-
1245
- if self.is_generation:
1246
- (
1247
- logits_output,
1248
- next_token_ids,
1249
- extend_input_len_per_req,
1250
- extend_logprob_start_len_per_req,
1251
- bid,
1252
- ) = (
1253
- result.logits_output,
1254
- result.next_token_ids,
1255
- result.extend_input_len_per_req,
1256
- result.extend_logprob_start_len_per_req,
1257
- result.bid,
1258
- )
1259
-
1260
- if self.enable_overlap:
1261
- logits_output, next_token_ids = self.tp_worker.resolve_batch_result(bid)
1262
- else:
1263
- # Move next_token_ids and logprobs to cpu
1264
- next_token_ids = next_token_ids.tolist()
1265
- if batch.return_logprob:
1266
- if logits_output.next_token_logprobs is not None:
1267
- logits_output.next_token_logprobs = (
1268
- logits_output.next_token_logprobs.tolist()
1269
- )
1270
- if logits_output.input_token_logprobs is not None:
1271
- logits_output.input_token_logprobs = tuple(
1272
- logits_output.input_token_logprobs.tolist()
1273
- )
1274
-
1275
- hidden_state_offset = 0
1276
-
1277
- # Check finish conditions
1278
- logprob_pt = 0
1279
- for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
1280
- if req.is_retracted:
1281
- continue
1282
-
1283
- if self.is_mixed_chunk and self.enable_overlap and req.finished():
1284
- # Free the one delayed token for the mixed decode batch
1285
- j = len(batch.out_cache_loc) - len(batch.reqs) + i
1286
- self.token_to_kv_pool_allocator.free(batch.out_cache_loc[j : j + 1])
1287
- continue
1288
-
1289
- if req.is_chunked <= 0:
1290
- # req output_ids are set here
1291
- req.output_ids.append(next_token_id)
1292
- req.check_finished()
1293
-
1294
- if req.finished():
1295
- self.tree_cache.cache_finished_req(req)
1296
- elif not batch.decoding_reqs or req not in batch.decoding_reqs:
1297
- # This updates radix so others can match
1298
- self.tree_cache.cache_unfinished_req(req)
1299
-
1300
- if req.return_logprob:
1301
- assert extend_logprob_start_len_per_req is not None
1302
- assert extend_input_len_per_req is not None
1303
- extend_logprob_start_len = extend_logprob_start_len_per_req[i]
1304
- extend_input_len = extend_input_len_per_req[i]
1305
- num_input_logprobs = extend_input_len - extend_logprob_start_len
1306
- self.add_logprob_return_values(
1307
- i,
1308
- req,
1309
- logprob_pt,
1310
- next_token_ids,
1311
- num_input_logprobs,
1312
- logits_output,
1313
- )
1314
- logprob_pt += num_input_logprobs
1315
-
1316
- if (
1317
- req.return_hidden_states
1318
- and logits_output.hidden_states is not None
1319
- ):
1320
- req.hidden_states.append(
1321
- logits_output.hidden_states[
1322
- hidden_state_offset : (
1323
- hidden_state_offset := hidden_state_offset
1324
- + len(req.origin_input_ids)
1325
- )
1326
- ]
1327
- .cpu()
1328
- .clone()
1329
- )
1330
-
1331
- if req.grammar is not None:
1332
- req.grammar.accept_token(next_token_id)
1333
- req.grammar.finished = req.finished()
1334
- else:
1335
- # being chunked reqs' prefill is not finished
1336
- req.is_chunked -= 1
1337
- # There is only at most one request being currently chunked.
1338
- # Because this request does not finish prefill,
1339
- # we don't want to stream the request currently being chunked.
1340
- skip_stream_req = req
1341
-
1342
- # Incrementally update input logprobs.
1343
- if req.return_logprob:
1344
- extend_logprob_start_len = extend_logprob_start_len_per_req[i]
1345
- extend_input_len = extend_input_len_per_req[i]
1346
- if extend_logprob_start_len < extend_input_len:
1347
- # Update input logprobs.
1348
- num_input_logprobs = (
1349
- extend_input_len - extend_logprob_start_len
1350
- )
1351
- self.add_input_logprob_return_values(
1352
- i,
1353
- req,
1354
- logits_output,
1355
- logprob_pt,
1356
- num_input_logprobs,
1357
- last_prefill_chunk=False,
1358
- )
1359
- logprob_pt += num_input_logprobs
1360
-
1361
- if batch.next_batch_sampling_info:
1362
- batch.next_batch_sampling_info.update_regex_vocab_mask()
1363
- self.current_stream.synchronize()
1364
- batch.next_batch_sampling_info.sampling_info_done.set()
1365
-
1366
- else: # embedding or reward model
1367
- embeddings, bid = result.embeddings, result.bid
1368
- embeddings = embeddings.tolist()
1369
-
1370
- # Check finish conditions
1371
- for i, req in enumerate(batch.reqs):
1372
- if req.is_retracted:
1373
- continue
1374
-
1375
- req.embedding = embeddings[i]
1376
- if req.is_chunked <= 0:
1377
- # Dummy output token for embedding models
1378
- req.output_ids.append(0)
1379
- req.check_finished()
1380
-
1381
- if req.finished():
1382
- self.tree_cache.cache_finished_req(req)
1383
- else:
1384
- self.tree_cache.cache_unfinished_req(req)
1385
- else:
1386
- # being chunked reqs' prefill is not finished
1387
- req.is_chunked -= 1
1388
-
1389
- self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)
1390
-
1391
- def process_batch_result_decode(
1392
- self,
1393
- batch: ScheduleBatch,
1394
- result: GenerationBatchResult,
1395
- ):
1396
- logits_output, next_token_ids, bid = (
1397
- result.logits_output,
1398
- result.next_token_ids,
1399
- result.bid,
1400
- )
1401
- self.num_generated_tokens += len(batch.reqs)
1402
-
1403
- if self.enable_overlap:
1404
- assert batch.spec_algorithm.is_none()
1405
- logits_output, next_token_ids = self.tp_worker.resolve_batch_result(bid)
1406
- next_token_logprobs = logits_output.next_token_logprobs
1407
- elif batch.spec_algorithm.is_none():
1408
- # spec decoding handles output logprobs inside verify process.
1409
- next_token_ids = next_token_ids.tolist()
1410
- if batch.return_logprob:
1411
- next_token_logprobs = logits_output.next_token_logprobs.tolist()
1412
-
1413
- self.token_to_kv_pool_allocator.free_group_begin()
1414
-
1415
- # Check finish condition
1416
- # NOTE: the length of reqs and next_token_ids don't match if it is spec decoding.
1417
- # We should ignore using next_token_ids for spec decoding cases.
1418
- for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
1419
- if req.is_retracted:
1420
- continue
1421
-
1422
- if self.enable_overlap and req.finished():
1423
- # Free the one delayed token
1424
- self.token_to_kv_pool_allocator.free(batch.out_cache_loc[i : i + 1])
1425
- continue
1426
-
1427
- if batch.spec_algorithm.is_none():
1428
- # speculative worker will solve the output_ids in speculative decoding
1429
- req.output_ids.append(next_token_id)
1430
-
1431
- req.check_finished()
1432
- if req.finished():
1433
- self.tree_cache.cache_finished_req(req)
1434
-
1435
- if req.return_logprob and batch.spec_algorithm.is_none():
1436
- # speculative worker handles logprob in speculative decoding
1437
- req.output_token_logprobs_val.append(next_token_logprobs[i])
1438
- req.output_token_logprobs_idx.append(next_token_id)
1439
- if req.top_logprobs_num > 0:
1440
- req.output_top_logprobs_val.append(
1441
- logits_output.next_token_top_logprobs_val[i]
1442
- )
1443
- req.output_top_logprobs_idx.append(
1444
- logits_output.next_token_top_logprobs_idx[i]
1445
- )
1446
- if req.token_ids_logprob is not None:
1447
- req.output_token_ids_logprobs_val.append(
1448
- logits_output.next_token_token_ids_logprobs_val[i]
1449
- )
1450
- req.output_token_ids_logprobs_idx.append(
1451
- logits_output.next_token_token_ids_logprobs_idx[i]
1452
- )
1453
-
1454
- if req.return_hidden_states and logits_output.hidden_states is not None:
1455
- req.hidden_states.append(logits_output.hidden_states[i].cpu().clone())
1456
-
1457
- if req.grammar is not None and batch.spec_algorithm.is_none():
1458
- req.grammar.accept_token(next_token_id)
1459
- req.grammar.finished = req.finished()
1460
-
1461
- if batch.next_batch_sampling_info:
1462
- batch.next_batch_sampling_info.update_regex_vocab_mask()
1463
- self.current_stream.synchronize()
1464
- batch.next_batch_sampling_info.sampling_info_done.set()
1465
-
1466
- self.stream_output(batch.reqs, batch.return_logprob)
1467
-
1468
- self.token_to_kv_pool_allocator.free_group_end()
1469
-
1470
- self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30)
1471
- if (
1472
- self.attn_tp_rank == 0
1473
- and self.forward_ct_decode % self.server_args.decode_log_interval == 0
1474
- ):
1475
- self.log_decode_stats()
1476
-
1477
- def add_input_logprob_return_values(
1478
- self,
1479
- i: int,
1480
- req: Req,
1481
- output: LogitsProcessorOutput,
1482
- logprob_pt: int,
1483
- num_input_logprobs: int,
1484
- last_prefill_chunk: bool, # If True, it means prefill is finished.
1485
- ):
1486
- """Incrementally add input logprobs to `req`.
1487
-
1488
- Args:
1489
- i: The request index in a batch.
1490
- req: The request. Input logprobs inside req are modified as a
1491
- consequence of the API
1492
- fill_ids: The prefill ids processed.
1493
- output: Logit processor output that's used to compute input logprobs
1494
- last_prefill_chunk: True if it is the last prefill (when chunked).
1495
- Some of input logprob operation should only happen at the last
1496
- prefill (e.g., computing input token logprobs).
1497
- """
1498
- assert output.input_token_logprobs is not None
1499
- if req.input_token_logprobs is None:
1500
- req.input_token_logprobs = []
1501
- if req.temp_input_top_logprobs_val is None:
1502
- req.temp_input_top_logprobs_val = []
1503
- if req.temp_input_top_logprobs_idx is None:
1504
- req.temp_input_top_logprobs_idx = []
1505
- if req.temp_input_token_ids_logprobs_val is None:
1506
- req.temp_input_token_ids_logprobs_val = []
1507
- if req.temp_input_token_ids_logprobs_idx is None:
1508
- req.temp_input_token_ids_logprobs_idx = []
1509
-
1510
- if req.input_token_logprobs_val is not None:
1511
- # The input logprob has been already computed. It only happens
1512
- # upon retract.
1513
- if req.top_logprobs_num > 0:
1514
- assert req.input_token_logprobs_val is not None
1515
- return
1516
-
1517
- # Important for the performance.
1518
- assert isinstance(output.input_token_logprobs, tuple)
1519
- input_token_logprobs: Tuple[int] = output.input_token_logprobs
1520
- input_token_logprobs = input_token_logprobs[
1521
- logprob_pt : logprob_pt + num_input_logprobs
1522
- ]
1523
- req.input_token_logprobs.extend(input_token_logprobs)
1524
-
1525
- if req.top_logprobs_num > 0:
1526
- req.temp_input_top_logprobs_val.append(output.input_top_logprobs_val[i])
1527
- req.temp_input_top_logprobs_idx.append(output.input_top_logprobs_idx[i])
1528
-
1529
- if req.token_ids_logprob is not None:
1530
- req.temp_input_token_ids_logprobs_val.append(
1531
- output.input_token_ids_logprobs_val[i]
1532
- )
1533
- req.temp_input_token_ids_logprobs_idx.append(
1534
- output.input_token_ids_logprobs_idx[i]
1535
- )
1536
-
1537
- if last_prefill_chunk:
1538
- input_token_logprobs = req.input_token_logprobs
1539
- req.input_token_logprobs = None
1540
- assert req.input_token_logprobs_val is None
1541
- assert req.input_token_logprobs_idx is None
1542
- assert req.input_top_logprobs_val is None
1543
- assert req.input_top_logprobs_idx is None
1544
-
1545
- # Compute input_token_logprobs_val
1546
- # Always pad the first one with None.
1547
- req.input_token_logprobs_val = [None]
1548
- req.input_token_logprobs_val.extend(input_token_logprobs)
1549
- # The last input logprob is for sampling, so just pop it out.
1550
- req.input_token_logprobs_val.pop()
1551
-
1552
- # Compute input_token_logprobs_idx
1553
- input_token_logprobs_idx = req.origin_input_ids[req.logprob_start_len :]
1554
- # Clip the padded hash values from image tokens.
1555
- # Otherwise, it will lead to detokenization errors.
1556
- input_token_logprobs_idx = [
1557
- x if x < self.model_config.vocab_size - 1 else 0
1558
- for x in input_token_logprobs_idx
1559
- ]
1560
- req.input_token_logprobs_idx = input_token_logprobs_idx
1561
-
1562
- if req.top_logprobs_num > 0:
1563
- req.input_top_logprobs_val = [None]
1564
- req.input_top_logprobs_idx = [None]
1565
- assert len(req.temp_input_token_ids_logprobs_val) == len(
1566
- req.temp_input_token_ids_logprobs_idx
1567
- )
1568
- for val, idx in zip(
1569
- req.temp_input_top_logprobs_val,
1570
- req.temp_input_top_logprobs_idx,
1571
- strict=True,
1572
- ):
1573
- req.input_top_logprobs_val.extend(val)
1574
- req.input_top_logprobs_idx.extend(idx)
1575
-
1576
- # Last token is a sample token.
1577
- req.input_top_logprobs_val.pop()
1578
- req.input_top_logprobs_idx.pop()
1579
- req.temp_input_top_logprobs_idx = None
1580
- req.temp_input_top_logprobs_val = None
1581
-
1582
- if req.token_ids_logprob is not None:
1583
- req.input_token_ids_logprobs_val = [None]
1584
- req.input_token_ids_logprobs_idx = [None]
1585
-
1586
- for val, idx in zip(
1587
- req.temp_input_token_ids_logprobs_val,
1588
- req.temp_input_token_ids_logprobs_idx,
1589
- strict=True,
1590
- ):
1591
- req.input_token_ids_logprobs_val.extend(val)
1592
- req.input_token_ids_logprobs_idx.extend(idx)
1593
-
1594
- # Last token is a sample token.
1595
- req.input_token_ids_logprobs_val.pop()
1596
- req.input_token_ids_logprobs_idx.pop()
1597
- req.temp_input_token_ids_logprobs_idx = None
1598
- req.temp_input_token_ids_logprobs_val = None
1599
-
1600
- if req.return_logprob:
1601
- relevant_tokens_len = len(req.origin_input_ids) - req.logprob_start_len
1602
- assert len(req.input_token_logprobs_val) == relevant_tokens_len
1603
- assert len(req.input_token_logprobs_idx) == relevant_tokens_len
1604
- if req.top_logprobs_num > 0:
1605
- assert len(req.input_top_logprobs_val) == relevant_tokens_len
1606
- assert len(req.input_top_logprobs_idx) == relevant_tokens_len
1607
- if req.token_ids_logprob is not None:
1608
- assert len(req.input_token_ids_logprobs_val) == relevant_tokens_len
1609
- assert len(req.input_token_ids_logprobs_idx) == relevant_tokens_len
1610
-
1611
- def add_logprob_return_values(
1612
- self,
1613
- i: int,
1614
- req: Req,
1615
- pt: int,
1616
- next_token_ids: List[int],
1617
- num_input_logprobs: int,
1618
- output: LogitsProcessorOutput,
1619
- ):
1620
- """Attach logprobs to the return values."""
1621
- req.output_token_logprobs_val.append(output.next_token_logprobs[i])
1622
- req.output_token_logprobs_idx.append(next_token_ids[i])
1623
-
1624
- self.add_input_logprob_return_values(
1625
- i, req, output, pt, num_input_logprobs, last_prefill_chunk=True
1626
- )
1627
-
1628
- if req.top_logprobs_num > 0:
1629
- req.output_top_logprobs_val.append(output.next_token_top_logprobs_val[i])
1630
- req.output_top_logprobs_idx.append(output.next_token_top_logprobs_idx[i])
1631
-
1632
- if req.token_ids_logprob is not None:
1633
- req.output_token_ids_logprobs_val.append(
1634
- output.next_token_token_ids_logprobs_val[i]
1635
- )
1636
- req.output_token_ids_logprobs_idx.append(
1637
- output.next_token_token_ids_logprobs_idx[i]
1638
- )
1639
-
1640
- return num_input_logprobs
1641
-
1642
- def stream_output(
1643
- self, reqs: List[Req], return_logprob: bool, skip_req: Optional[Req] = None
1644
- ):
1645
- """Stream the output to detokenizer."""
1646
- rids = []
1647
- finished_reasons: List[BaseFinishReason] = []
1648
-
1649
- if self.is_generation:
1650
- decoded_texts = []
1651
- decode_ids_list = []
1652
- read_offsets = []
1653
- output_ids = []
1654
-
1655
- skip_special_tokens = []
1656
- spaces_between_special_tokens = []
1657
- no_stop_trim = []
1658
- prompt_tokens = []
1659
- completion_tokens = []
1660
- cached_tokens = []
1661
- spec_verify_ct = []
1662
- output_hidden_states = None
1663
-
1664
- if return_logprob:
1665
- input_token_logprobs_val = []
1666
- input_token_logprobs_idx = []
1667
- output_token_logprobs_val = []
1668
- output_token_logprobs_idx = []
1669
- input_top_logprobs_val = []
1670
- input_top_logprobs_idx = []
1671
- output_top_logprobs_val = []
1672
- output_top_logprobs_idx = []
1673
- input_token_ids_logprobs_val = []
1674
- input_token_ids_logprobs_idx = []
1675
- output_token_ids_logprobs_val = []
1676
- output_token_ids_logprobs_idx = []
1677
- else:
1678
- input_token_logprobs_val = input_token_logprobs_idx = (
1679
- output_token_logprobs_val
1680
- ) = output_token_logprobs_idx = input_top_logprobs_val = (
1681
- input_top_logprobs_idx
1682
- ) = output_top_logprobs_val = output_top_logprobs_idx = (
1683
- input_token_ids_logprobs_val
1684
- ) = input_token_ids_logprobs_idx = output_token_ids_logprobs_val = (
1685
- output_token_ids_logprobs_idx
1686
- ) = None
1687
-
1688
- for req in reqs:
1689
- if req is skip_req:
1690
- continue
1691
-
1692
- # Multimodal partial stream chunks break the detokenizer, so drop aborted requests here.
1693
- if self.model_config.is_multimodal_gen and req.to_abort:
1694
- continue
1695
-
1696
- if (
1697
- req.finished()
1698
- # If stream, follow the given stream_interval
1699
- or (req.stream and len(req.output_ids) % self.stream_interval == 0)
1700
- # If not stream, we still want to output some tokens to get the benefit of incremental decoding.
1701
- # TODO(lianmin): this is wrong for speculative decoding because len(req.output_ids) does not
1702
- # always increase one-by-one.
1703
- or (
1704
- not req.stream
1705
- and len(req.output_ids) % 50 == 0
1706
- and not self.model_config.is_multimodal_gen
1707
- )
1708
- ):
1709
- rids.append(req.rid)
1710
- finished_reasons.append(
1711
- req.finished_reason.to_json() if req.finished_reason else None
1712
- )
1713
- decoded_texts.append(req.decoded_text)
1714
- decode_ids, read_offset = req.init_incremental_detokenize()
1715
- decode_ids_list.append(decode_ids)
1716
- read_offsets.append(read_offset)
1717
- if self.skip_tokenizer_init:
1718
- output_ids.append(req.output_ids)
1719
- skip_special_tokens.append(req.sampling_params.skip_special_tokens)
1720
- spaces_between_special_tokens.append(
1721
- req.sampling_params.spaces_between_special_tokens
1722
- )
1723
- no_stop_trim.append(req.sampling_params.no_stop_trim)
1724
-
1725
- prompt_tokens.append(len(req.origin_input_ids))
1726
- completion_tokens.append(len(req.output_ids))
1727
- cached_tokens.append(req.cached_tokens)
1728
-
1729
- if not self.spec_algorithm.is_none():
1730
- spec_verify_ct.append(req.spec_verify_ct)
1731
-
1732
- if return_logprob:
1733
- input_token_logprobs_val.append(req.input_token_logprobs_val)
1734
- input_token_logprobs_idx.append(req.input_token_logprobs_idx)
1735
- output_token_logprobs_val.append(req.output_token_logprobs_val)
1736
- output_token_logprobs_idx.append(req.output_token_logprobs_idx)
1737
- input_top_logprobs_val.append(req.input_top_logprobs_val)
1738
- input_top_logprobs_idx.append(req.input_top_logprobs_idx)
1739
- output_top_logprobs_val.append(req.output_top_logprobs_val)
1740
- output_top_logprobs_idx.append(req.output_top_logprobs_idx)
1741
- input_token_ids_logprobs_val.append(
1742
- req.input_token_ids_logprobs_val
1743
- )
1744
- input_token_ids_logprobs_idx.append(
1745
- req.input_token_ids_logprobs_idx
1746
- )
1747
- output_token_ids_logprobs_val.append(
1748
- req.output_token_ids_logprobs_val
1749
- )
1750
- output_token_ids_logprobs_idx.append(
1751
- req.output_token_ids_logprobs_idx
1752
- )
1753
-
1754
- if req.return_hidden_states:
1755
- if output_hidden_states is None:
1756
- output_hidden_states = []
1757
- output_hidden_states.append(req.hidden_states)
1758
-
1759
- # Send to detokenizer
1760
- if rids:
1761
- if self.model_config.is_multimodal_gen:
1762
- raise NotImplementedError()
1763
- self.send_to_detokenizer.send_pyobj(
1764
- BatchTokenIDOut(
1765
- rids,
1766
- finished_reasons,
1767
- decoded_texts,
1768
- decode_ids_list,
1769
- read_offsets,
1770
- output_ids,
1771
- skip_special_tokens,
1772
- spaces_between_special_tokens,
1773
- no_stop_trim,
1774
- prompt_tokens,
1775
- completion_tokens,
1776
- cached_tokens,
1777
- spec_verify_ct,
1778
- input_token_logprobs_val,
1779
- input_token_logprobs_idx,
1780
- output_token_logprobs_val,
1781
- output_token_logprobs_idx,
1782
- input_top_logprobs_val,
1783
- input_top_logprobs_idx,
1784
- output_top_logprobs_val,
1785
- output_top_logprobs_idx,
1786
- input_token_ids_logprobs_val,
1787
- input_token_ids_logprobs_idx,
1788
- output_token_ids_logprobs_val,
1789
- output_token_ids_logprobs_idx,
1790
- output_hidden_states,
1791
- )
1792
- )
1793
- else: # embedding or reward model
1794
- embeddings = []
1795
- prompt_tokens = []
1796
- cached_tokens = []
1797
- for req in reqs:
1798
- if req.finished():
1799
- rids.append(req.rid)
1800
- finished_reasons.append(req.finished_reason.to_json())
1801
- embeddings.append(req.embedding)
1802
- prompt_tokens.append(len(req.origin_input_ids))
1803
- cached_tokens.append(req.cached_tokens)
1804
- self.send_to_detokenizer.send_pyobj(
1805
- BatchEmbeddingOut(
1806
- rids, finished_reasons, embeddings, prompt_tokens, cached_tokens
1807
- )
1808
- )
1809
-
1810
1269
  def prepare_dp_attn_batch(self, local_batch: ScheduleBatch):
1811
1270
  # Check if other DP workers have running batches
1812
1271
  if local_batch is None:
1813
1272
  num_tokens = 0
1273
+ global_num_tokens_for_logprob = 0
1814
1274
  elif local_batch.forward_mode.is_decode():
1815
1275
  num_tokens = local_batch.batch_size()
1276
+ if not self.spec_algorithm.is_none() and self.spec_algorithm.is_eagle():
1277
+ num_tokens = num_tokens * self.server_args.speculative_num_draft_tokens
1278
+ global_num_tokens_for_logprob = num_tokens
1816
1279
  else:
1817
1280
  num_tokens = local_batch.extend_num_tokens
1281
+ global_num_tokens_for_logprob = sum(
1282
+ [
1283
+ # We should have at least 1 token for sample in every case.
1284
+ max(extend_len - logprob_start_len, 1)
1285
+ for logprob_start_len, extend_len in zip(
1286
+ local_batch.extend_logprob_start_lens, local_batch.extend_lens
1287
+ )
1288
+ ]
1289
+ )
1290
+
1291
+ if local_batch is None or local_batch.forward_mode.is_decode_or_idle():
1292
+ can_cuda_graph = 1
1293
+ else:
1294
+ can_cuda_graph = 0
1818
1295
 
1819
- local_num_tokens = torch.tensor([num_tokens], dtype=torch.int64)
1820
- global_num_tokens = torch.empty(self.tp_size, dtype=torch.int64)
1296
+ if not self.spec_algorithm.is_none():
1297
+ # TODO(sang): Support cuda graph when idle batch is there.
1298
+ if local_batch is None or local_batch.forward_mode.is_idle():
1299
+ can_cuda_graph = 0
1300
+
1301
+ is_extend_in_batch = (
1302
+ local_batch.forward_mode.is_extend() if local_batch else False
1303
+ )
1304
+ local_info = torch.tensor(
1305
+ [
1306
+ num_tokens,
1307
+ can_cuda_graph,
1308
+ global_num_tokens_for_logprob,
1309
+ is_extend_in_batch,
1310
+ ],
1311
+ dtype=torch.int64,
1312
+ )
1313
+ global_info = torch.empty(
1314
+ (self.server_args.dp_size, self.attn_tp_size, 4),
1315
+ dtype=torch.int64,
1316
+ )
1821
1317
  torch.distributed.all_gather_into_tensor(
1822
- global_num_tokens,
1823
- local_num_tokens,
1318
+ global_info.flatten(),
1319
+ local_info,
1824
1320
  group=self.tp_cpu_group,
1825
1321
  )
1322
+ global_num_tokens = global_info[:, 0, 0].tolist()
1323
+ can_cuda_graph = min(global_info[:, 0, 1].tolist())
1324
+ global_num_tokens_for_logprob = global_info[:, 0, 2].tolist()
1325
+ is_extend_in_batch = global_info[:, 0, 3].tolist()
1826
1326
 
1827
- if local_batch is None and global_num_tokens.max().item() > 0:
1327
+ if local_batch is None and max(global_num_tokens) > 0:
1828
1328
  local_batch = self.get_idle_batch()
1829
1329
 
1830
1330
  if local_batch is not None:
1831
- local_batch.global_num_tokens = global_num_tokens.tolist()
1331
+ local_batch.global_num_tokens = global_num_tokens
1332
+ local_batch.global_num_tokens_for_logprob = global_num_tokens_for_logprob
1832
1333
 
1833
1334
  # Check forward mode for cuda graph
1834
1335
  if not self.server_args.disable_cuda_graph:
1835
- forward_mode_state = torch.tensor(
1836
- (1 if local_batch.forward_mode.is_decode_or_idle() else 0),
1837
- dtype=torch.int32,
1838
- )
1839
- torch.distributed.all_reduce(
1840
- forward_mode_state,
1841
- op=torch.distributed.ReduceOp.MIN,
1842
- group=self.tp_cpu_group,
1843
- )
1844
- local_batch.can_run_dp_cuda_graph = forward_mode_state.item() == 1
1336
+ local_batch.can_run_dp_cuda_graph = can_cuda_graph
1845
1337
 
1846
- return local_batch
1338
+ return local_batch, any(is_extend_in_batch)
1847
1339
 
1848
1340
  def get_idle_batch(self):
1849
1341
  idle_batch = ScheduleBatch.init_new(
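
The rewritten `prepare_dp_attn_batch` above replaces an all_gather of token counts plus a separate all_reduce of the CUDA-graph flag with a single collective: each rank packs four scalars (token count, CUDA-graph eligibility, logprob token count, extend-in-batch flag) into one int64 tensor and all ranks gather them at once. A condensed sketch of that exchange, assuming an already-initialized process group whose world size is dp_size * attn_tp_size (the helper name and argument list are illustrative):

    import torch
    import torch.distributed as dist

    def gather_dp_batch_info(num_tokens, can_cuda_graph, num_logprob_tokens,
                             is_extend_in_batch, group, dp_size, attn_tp_size):
        local_info = torch.tensor(
            [num_tokens, can_cuda_graph, num_logprob_tokens, is_extend_in_batch],
            dtype=torch.int64,
        )
        global_info = torch.empty((dp_size, attn_tp_size, 4), dtype=torch.int64)
        # One collective carries all four fields for every rank.
        dist.all_gather_into_tensor(global_info.flatten(), local_info, group=group)
        return (
            global_info[:, 0, 0].tolist(),       # tokens per DP group
            min(global_info[:, 0, 1].tolist()),  # CUDA graph only if every rank agrees
            global_info[:, 0, 2].tolist(),       # logprob token counts
            any(global_info[:, 0, 3].tolist()),  # does any rank have an extend batch?
        )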
@@ -1926,9 +1418,7 @@ class Scheduler:
1926
1418
 
1927
1419
  def flush_cache(self):
1928
1420
  """Flush the memory pool and cache."""
1929
- if len(self.waiting_queue) == 0 and (
1930
- self.running_batch is None or len(self.running_batch.reqs) == 0
1931
- ):
1421
+ if len(self.waiting_queue) == 0 and self.running_batch.is_empty():
1932
1422
  self.cur_batch = None
1933
1423
  self.last_batch = None
1934
1424
  self.tree_cache.reset()
@@ -1954,7 +1444,7 @@ class Scheduler:
1954
1444
  logging.warning(
1955
1445
  f"Cache not flushed because there are pending requests. "
1956
1446
  f"#queue-req: {len(self.waiting_queue)}, "
1957
- f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}"
1447
+ f"#running-req: {len(self.running_batch.reqs)}"
1958
1448
  )
1959
1449
  if_success = False
1960
1450
  return if_success
@@ -2004,24 +1494,24 @@ class Scheduler:
2004
1494
 
2005
1495
  def abort_request(self, recv_req: AbortReq):
2006
1496
  # Delete requests in the waiting queue
2007
- to_del = None
1497
+ to_del = []
2008
1498
  for i, req in enumerate(self.waiting_queue):
2009
- if req.rid == recv_req.rid:
2010
- to_del = i
1499
+ if req.rid.startswith(recv_req.rid):
1500
+ to_del.append(i)
2011
1501
  break
2012
1502
 
2013
- if to_del is not None:
2014
- del self.waiting_queue[to_del]
1503
+ # Sort in reverse order to avoid index issues when deleting
1504
+ for i in sorted(to_del, reverse=True):
1505
+ req = self.waiting_queue.pop(i)
2015
1506
  logger.debug(f"Abort queued request. {req.rid=}")
2016
1507
  return
2017
1508
 
2018
1509
  # Delete requests in the running batch
2019
- if self.running_batch:
2020
- for req in self.running_batch.reqs:
2021
- if req.rid == recv_req.rid and not req.finished():
2022
- logger.debug(f"Abort running request. {req.rid=}")
2023
- req.to_abort = True
2024
- break
1510
+ for req in self.running_batch.reqs:
1511
+ if req.rid.startswith(recv_req.rid) and not req.finished():
1512
+ logger.debug(f"Abort running request. {req.rid=}")
1513
+ req.to_abort = True
1514
+ return
2025
1515
 
2026
1516
  def _pause_engine(self) -> Tuple[List[Req], int]:
2027
1517
  raise NotImplementedError()
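
`abort_request` above now matches requests whose rid starts with the incoming id rather than requiring an exact match, so one abort message can cover a group of related sub-requests, and queued matches are popped in reverse index order so earlier deletions do not shift the remaining indices. A tiny illustration of that deletion order (the queue contents and rid prefix here are made up):

    queue = ["abc-0", "xyz-1", "abc-2", "abc-3"]
    to_del = [i for i, rid in enumerate(queue) if rid.startswith("abc")]
    for i in sorted(to_del, reverse=True):   # delete from the back: 3, 2, 0
        queue.pop(i)
    assert queue == ["xyz-1"]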
@@ -2228,9 +1718,16 @@ def run_scheduler_process(
2228
1718
  dp_rank: Optional[int],
2229
1719
  pipe_writer,
2230
1720
  ):
1721
+
1722
+ # Generate the prefix
1723
+ if dp_rank is None:
1724
+ prefix = f" TP{tp_rank}"
1725
+ else:
1726
+ prefix = f" DP{dp_rank} TP{tp_rank}"
1727
+
2231
1728
  # Config the process
2232
1729
  # kill_itself_when_parent_died() # This is disabled because it does not work for `--dp 2`
2233
- setproctitle.setproctitle(f"sglang::scheduler_{dp_rank}")
1730
+ setproctitle.setproctitle(f"sglang::scheduler{prefix.replace(' ', '_')}")
2234
1731
  faulthandler.enable()
2235
1732
  parent_process = psutil.Process().parent()
2236
1733
 
@@ -2239,10 +1736,6 @@ def run_scheduler_process(
2239
1736
  dp_rank = int(os.environ["SGLANG_DP_RANK"])
2240
1737
 
2241
1738
  # Configure the logger
2242
- if dp_rank is None:
2243
- prefix = f" TP{tp_rank}"
2244
- else:
2245
- prefix = f" DP{dp_rank} TP{tp_rank}"
2246
1739
  configure_logger(server_args, prefix=prefix)
2247
1740
  suppress_other_loggers()
2248
1741