sglang 0.3.6.post2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +55 -2
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +4 -3
- sglang/bench_serving.py +13 -0
- sglang/check_env.py +1 -1
- sglang/launch_server.py +3 -2
- sglang/srt/_custom_ops.py +118 -0
- sglang/srt/configs/device_config.py +17 -0
- sglang/srt/configs/load_config.py +84 -0
- sglang/srt/configs/model_config.py +161 -4
- sglang/srt/configs/qwen2vl.py +5 -8
- sglang/srt/constrained/outlines_backend.py +6 -1
- sglang/srt/constrained/outlines_jump_forward.py +8 -1
- sglang/srt/distributed/__init__.py +3 -0
- sglang/srt/distributed/communication_op.py +34 -0
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
- sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
- sglang/srt/distributed/device_communicators/pynccl.py +204 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
- sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
- sglang/srt/distributed/parallel_state.py +1275 -0
- sglang/srt/distributed/utils.py +223 -0
- sglang/srt/hf_transformers_utils.py +37 -1
- sglang/srt/layers/attention/flashinfer_backend.py +13 -15
- sglang/srt/layers/attention/torch_native_backend.py +285 -0
- sglang/srt/layers/fused_moe_patch.py +20 -11
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +17 -3
- sglang/srt/layers/quantization/__init__.py +34 -0
- sglang/srt/layers/vocab_parallel_embedding.py +1 -0
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +7 -11
- sglang/srt/managers/detokenizer_manager.py +7 -4
- sglang/srt/managers/image_processor.py +1 -1
- sglang/srt/managers/io_struct.py +48 -12
- sglang/srt/managers/schedule_batch.py +42 -36
- sglang/srt/managers/schedule_policy.py +7 -4
- sglang/srt/managers/scheduler.py +111 -46
- sglang/srt/managers/session_controller.py +0 -3
- sglang/srt/managers/tokenizer_manager.py +169 -100
- sglang/srt/managers/tp_worker.py +36 -3
- sglang/srt/managers/tp_worker_overlap_thread.py +32 -5
- sglang/srt/model_executor/cuda_graph_runner.py +16 -7
- sglang/srt/model_executor/forward_batch_info.py +9 -4
- sglang/srt/model_executor/model_runner.py +136 -150
- sglang/srt/model_loader/__init__.py +34 -0
- sglang/srt/model_loader/loader.py +1139 -0
- sglang/srt/model_loader/utils.py +41 -0
- sglang/srt/model_loader/weight_utils.py +640 -0
- sglang/srt/models/baichuan.py +9 -10
- sglang/srt/models/chatglm.py +6 -15
- sglang/srt/models/commandr.py +2 -3
- sglang/srt/models/dbrx.py +2 -3
- sglang/srt/models/deepseek.py +4 -11
- sglang/srt/models/deepseek_v2.py +3 -11
- sglang/srt/models/exaone.py +2 -3
- sglang/srt/models/gemma.py +2 -6
- sglang/srt/models/gemma2.py +3 -14
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/gpt2.py +5 -12
- sglang/srt/models/gpt_bigcode.py +6 -22
- sglang/srt/models/grok.py +14 -51
- sglang/srt/models/internlm2.py +2 -3
- sglang/srt/models/internlm2_reward.py +0 -1
- sglang/srt/models/llama.py +97 -27
- sglang/srt/models/llama_classification.py +1 -2
- sglang/srt/models/llama_embedding.py +1 -2
- sglang/srt/models/llama_reward.py +2 -3
- sglang/srt/models/llava.py +10 -12
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +4 -7
- sglang/srt/models/minicpm3.py +6 -19
- sglang/srt/models/mixtral.py +12 -5
- sglang/srt/models/mixtral_quant.py +2 -3
- sglang/srt/models/mllama.py +3 -7
- sglang/srt/models/olmo.py +2 -8
- sglang/srt/models/olmo2.py +391 -0
- sglang/srt/models/olmoe.py +3 -5
- sglang/srt/models/phi3_small.py +8 -8
- sglang/srt/models/qwen.py +2 -3
- sglang/srt/models/qwen2.py +10 -9
- sglang/srt/models/qwen2_moe.py +4 -11
- sglang/srt/models/qwen2_vl.py +12 -9
- sglang/srt/models/registry.py +99 -0
- sglang/srt/models/stablelm.py +2 -3
- sglang/srt/models/torch_native_llama.py +6 -12
- sglang/srt/models/xverse.py +2 -4
- sglang/srt/models/xverse_moe.py +4 -11
- sglang/srt/models/yivl.py +2 -3
- sglang/srt/openai_api/adapter.py +10 -6
- sglang/srt/openai_api/protocol.py +1 -0
- sglang/srt/server.py +303 -204
- sglang/srt/server_args.py +65 -31
- sglang/srt/utils.py +253 -48
- sglang/test/test_utils.py +27 -7
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/METADATA +2 -1
- sglang-0.4.0.dist-info/RECORD +184 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
- sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
- sglang/srt/layers/fused_moe_grok/layer.py +0 -630
- sglang-0.3.6.post2.dist-info/RECORD +0 -164
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/LICENSE +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py
CHANGED
```diff
@@ -15,6 +15,7 @@
 
 import logging
 import os
+import signal
 import threading
 import time
 import warnings
```
```diff
@@ -23,6 +24,7 @@ from concurrent import futures
 from types import SimpleNamespace
 from typing import List, Optional
 
+import psutil
 import torch
 import zmq
 
```
```diff
@@ -36,15 +38,19 @@ from sglang.srt.managers.io_struct import (
     BatchTokenIDOut,
     CloseSessionReqInput,
     FlushCacheReq,
-    GetMemPoolSizeReq,
-    GetMemPoolSizeReqOutput,
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
     OpenSessionReqInput,
     OpenSessionReqOutput,
     ProfileReq,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
-    UpdateWeightReqInput,
-    UpdateWeightReqOutput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightFromDiskReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
 )
 from sglang.srt.managers.schedule_batch import (
     FINISH_ABORT,
```
```diff
@@ -73,7 +79,6 @@ from sglang.srt.utils import (
     crash_on_warnings,
     get_bool_env_var,
     get_zmq_socket,
-    kill_parent_process,
     set_gpu_proc_affinity,
     set_random_seed,
     suppress_other_loggers,
```
```diff
@@ -142,9 +147,12 @@ class Scheduler:
         self.model_config = ModelConfig(
             server_args.model_path,
             trust_remote_code=server_args.trust_remote_code,
+            revision=server_args.revision,
             context_length=server_args.context_length,
             model_override_args=server_args.json_model_override_args,
             is_embedding=server_args.is_embedding,
+            dtype=server_args.dtype,
+            quantization=server_args.quantization,
         )
         self.is_generation = self.model_config.is_generation
 
```
```diff
@@ -170,6 +178,10 @@ class Scheduler:
             self.enable_overlap = False
             logger.info("Overlap scheduler is disabled for embedding models.")
 
+        if self.model_config.is_multimodal:
+            self.enable_overlap = False
+            logger.info("Overlap scheduler is disabled for multimodal models.")
+
         if self.enable_overlap:
             self.disable_jump_forward = True
 
```
```diff
@@ -250,6 +262,8 @@ class Scheduler:
 
         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
+        if self.chunked_prefill_size <= 0:  # -1 means disable
+            self.chunked_prefill_size = None
         self.being_chunked_req = None
         self.is_mixed_chunk = (
             self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
```
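The guard added here keeps one internal convention: any non-positive CLI value (the flag uses -1 for "disabled") becomes `None`, so downstream code only ever tests `self.chunked_prefill_size is not None`. A minimal standalone sketch of the same normalization (function name hypothetical, not part of sglang):

```python
from typing import Optional

def normalize_chunked_prefill_size(raw: int) -> Optional[int]:
    """Map the CLI convention (-1 or 0 means disabled) to an Optional,
    so callers only ever need the single check `size is not None`."""
    return None if raw <= 0 else raw

assert normalize_chunked_prefill_size(-1) is None
assert normalize_chunked_prefill_size(8192) == 8192
```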
```diff
@@ -312,6 +326,7 @@ class Scheduler:
         self.watchdog_timeout = server_args.watchdog_timeout
         t = threading.Thread(target=self.watchdog_thread, daemon=True)
         t.start()
+        self.parent_process = psutil.Process().parent()
 
         # Init profiler
         if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
```
```diff
@@ -355,7 +370,7 @@ class Scheduler:
             self.watchdog_last_time = time.time()
             time.sleep(self.watchdog_timeout / 2)
 
-        kill_parent_process()
+        self.parent_process.send_signal(signal.SIGQUIT)
 
     @torch.no_grad()
     def event_loop_normal(self):
```
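The watchdog (and, further down, the crash handler in `run_scheduler_process`) now signals the parent directly instead of calling the removed `kill_parent_process` helper: the `psutil` handle is resolved once at startup, and `SIGQUIT` is raised when the timeout trips. A sketch of that pattern under the same assumptions (a POSIX parent that installs a `SIGQUIT` handler; `psutil` installed):

```python
import signal
import time

import psutil  # third-party: pip install psutil

def watchdog(timeout_s: float, last_beat: list) -> None:
    """Signal the parent process if the main loop stops heartbeating."""
    parent = psutil.Process().parent()  # resolved once, like self.parent_process
    while True:
        time.sleep(timeout_s / 2)
        if time.time() - last_beat[0] > timeout_s:
            # SIGQUIT asks the parent to tear down the whole process tree;
            # a daemon thread cannot do that cleanly on its own.
            parent.send_signal(signal.SIGQUIT)
            return
```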
```diff
@@ -500,11 +515,27 @@ class Scheduler:
             self.flush_cache()
         elif isinstance(recv_req, AbortReq):
             self.abort_request(recv_req)
-        elif isinstance(recv_req, UpdateWeightReqInput):
-            success, message = self.update_weights(recv_req)
+        elif isinstance(recv_req, UpdateWeightFromDiskReqInput):
+            success, message = self.update_weights_from_disk(recv_req)
+            self.send_to_tokenizer.send_pyobj(
+                UpdateWeightFromDiskReqOutput(success, message)
+            )
+        elif isinstance(recv_req, GetWeightsByNameReqInput):
+            parameter = self.get_weights_by_name(recv_req)
+            self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
+        elif isinstance(recv_req, InitWeightsUpdateGroupReqInput):
+            success, message = self.init_weights_update_group(recv_req)
             self.send_to_tokenizer.send_pyobj(
-                UpdateWeightReqOutput(success, message)
+                InitWeightsUpdateGroupReqOutput(success, message)
             )
+        elif isinstance(recv_req, UpdateWeightsFromDistributedReqInput):
+            success, message = self.update_weights_from_distributed(recv_req)
+            self.send_to_tokenizer.send_pyobj(
+                UpdateWeightsFromDistributedReqOutput(success, message)
+            )
+        elif isinstance(recv_req, GetWeightsByNameReqInput):
+            parameter = self.get_weights_by_name(recv_req)
+            self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
         elif isinstance(recv_req, ProfileReq):
             if recv_req == ProfileReq.START_PROFILE:
                 self.start_profile()
```
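Each control message is a pickled object received over ZMQ, matched by concrete type, and answered with a typed output object on the tokenizer socket; the diff swaps the old single `UpdateWeightReqInput` branch for several request/output pairs. A stripped-down sketch of the same dispatch shape (the dataclasses and worker method here are hypothetical, not the real io_struct types):

```python
from dataclasses import dataclass

@dataclass
class UpdateWeightsReq:
    model_path: str

@dataclass
class UpdateWeightsOut:
    success: bool
    message: str

def process_control_request(recv_req, send_to_tokenizer, worker):
    # One isinstance branch per request type; every branch replies with the
    # matching *Out object so the tokenizer side can await a typed response.
    if isinstance(recv_req, UpdateWeightsReq):
        success, message = worker.update_weights(recv_req)
        send_to_tokenizer.send_pyobj(UpdateWeightsOut(success, message))
    else:
        raise ValueError(f"Invalid request: {recv_req}")
```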
```diff
@@ -515,10 +546,6 @@ class Scheduler:
             self.send_to_tokenizer.send_pyobj(OpenSessionReqOutput(session_id))
         elif isinstance(recv_req, CloseSessionReqInput):
             self.close_session(recv_req)
-        elif isinstance(recv_req, GetMemPoolSizeReq):
-            self.send_to_tokenizer.send_pyobj(
-                GetMemPoolSizeReqOutput(self.max_total_num_tokens)
-            )
         else:
             raise ValueError(f"Invalid request: {recv_req}")
 
```
```diff
@@ -526,8 +553,9 @@ class Scheduler:
         self,
         recv_req: TokenizedGenerateReqInput,
     ):
+        # Create a new request
         if recv_req.session_id is None or recv_req.session_id not in self.sessions:
-
+
             if recv_req.input_embeds is not None:
                 # Generate fake input_ids based on the length of input_embeds
                 seq_length = len(recv_req.input_embeds)
```
```diff
@@ -558,25 +586,30 @@ class Scheduler:
             self.waiting_queue.append(req)
             return
 
-        #
+        # Handle image inputs
         if recv_req.image_inputs is not None:
-            image_inputs = ImageInputs.from_dict(
-                recv_req.image_inputs, self.model_config.vocab_size
-            )
+            image_inputs = ImageInputs.from_dict(recv_req.image_inputs)
+            # Expand a single image token into multiple dummy tokens for receiving image embeddings
             req.origin_input_ids = self.pad_input_ids_func(
                 req.origin_input_ids, image_inputs
             )
-            req.extend_image_inputs(image_inputs)
+            req.extend_image_inputs(image_inputs)
 
-            if len(req.origin_input_ids) > self.max_req_input_len:
-                req.finished_reason = FINISH_ABORT(
-                    "
-                    "
+            if len(req.origin_input_ids) >= self.max_req_input_len:
+                logger.error(
+                    "Multimodal prompt is too long after expanding multimodal tokens. "
+                    f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}. "
                 )
+                req.origin_input_ids = [0]
+                req.image_inputs = None
                 req.sampling_params.max_new_tokens = 0
+                req.finished_reason = FINISH_ABORT(
+                    "Multimodal prompt is too long. Check server logs for details."
+                )
                 self.waiting_queue.append(req)
                 return
 
+        # Copy more attributes
         req.return_logprob = recv_req.return_logprob
         req.top_logprobs_num = recv_req.top_logprobs_num
         req.stream = recv_req.stream
```
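Expanding one image placeholder into many dummy embedding slots can push a prompt past `max_req_input_len`; the new path logs the exact before/after sizes, zeroes the request, and aborts it with a user-visible reason instead of failing later in the pipeline. A self-contained sketch of the expand-then-guard flow (the placeholder id and expansion factor are made up for illustration):

```python
IMAGE_TOKEN = -100       # hypothetical placeholder id in the raw prompt
EMBEDS_PER_IMAGE = 576   # hypothetical number of dummy slots per image

def pad_input_ids(input_ids: list) -> list:
    """Expand each image placeholder into EMBEDS_PER_IMAGE dummy ids (0)."""
    out = []
    for tok in input_ids:
        out.extend([0] * EMBEDS_PER_IMAGE if tok == IMAGE_TOKEN else [tok])
    return out

def admit_multimodal(input_ids: list, max_req_input_len: int):
    """Return (padded_ids, None) on success or (None, error) on overflow."""
    padded = pad_input_ids(input_ids)
    if len(padded) >= max_req_input_len:
        # Image embeds cannot be truncated, so abort rather than clip.
        return None, "Multimodal prompt is too long after expanding multimodal tokens."
    return padded, None
```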
```diff
@@ -647,7 +680,7 @@ class Scheduler:
 
         self.waiting_queue.append(req)
 
-    def log_prefill_stats(self, adder, can_run_list, running_bs,
+    def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
         if isinstance(self.tree_cache, RadixCache):
             self.tree_cache_metrics["total"] += (
                 adder.log_input_tokens + adder.log_hit_tokens
```
```diff
@@ -671,14 +704,14 @@ class Scheduler:
             f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
             f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
             f"#running-req: {running_bs}, "
-            f"#queue-req: {len(self.waiting_queue) +
+            f"#queue-req: {len(self.waiting_queue) + has_being_chunked}"
         )
 
         if self.enable_metrics:
             self.stats.num_running_reqs = running_bs
             self.stats.num_used_tokens = num_used
             self.stats.token_usage = round(num_used / self.max_total_num_tokens, 2)
-            self.stats.num_queue_reqs = len(self.waiting_queue) +
+            self.stats.num_queue_reqs = len(self.waiting_queue) + has_being_chunked
             self.stats.cache_hit_rate = tree_cache_hit_rate
             self.metrics_collector.log_stats(self.stats)
 
```
```diff
@@ -739,7 +772,7 @@ class Scheduler:
             # Move the chunked request out of the batch
             self.last_batch.filter_batch(being_chunked_req=self.being_chunked_req)
             self.tree_cache.cache_unfinished_req(self.being_chunked_req)
-            #
+            # being chunked request keeps its rid but will get a new req_pool_idx
             self.req_to_token_pool.free(self.being_chunked_req.req_pool_idx)
             self.batch_is_full = False
 
```
```diff
@@ -790,10 +823,10 @@ class Scheduler:
             running_bs if self.is_mixed_chunk else 0,
         )
 
-
-        if
+        has_being_chunked = self.being_chunked_req is not None
+        if has_being_chunked:
             self.being_chunked_req.init_next_round_input()
-            self.being_chunked_req = adder.
+            self.being_chunked_req = adder.add_being_chunked_req(self.being_chunked_req)
 
         if self.lora_paths:
             lora_set = (
```
```diff
@@ -835,16 +868,16 @@ class Scheduler:
                 x for x in self.waiting_queue if x not in set(can_run_list)
             ]
 
-        if adder.
+        if adder.new_being_chunked_req is not None:
             assert self.being_chunked_req is None
-            self.being_chunked_req = adder.
+            self.being_chunked_req = adder.new_being_chunked_req
 
         if self.being_chunked_req:
             self.being_chunked_req.is_being_chunked += 1
 
         # Print stats
         if self.tp_rank == 0:
-            self.log_prefill_stats(adder, can_run_list, running_bs,
+            self.log_prefill_stats(adder, can_run_list, running_bs, has_being_chunked)
 
         # Create a new batch
         new_batch = ScheduleBatch.init_new(
```
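A prompt that exceeds the per-batch chunked-prefill budget is threaded through several batches: the adder hands back either the still-unfinished request or a `new_being_chunked_req`, and the `is_being_chunked` counter tracks outstanding partial prefills so the result path (two hunks below) knows those requests produce no output yet. A toy model of that counter protocol, heavily simplified relative to the real `schedule_policy` logic:

```python
class Req:
    def __init__(self, rid: str, prompt_len: int):
        self.rid = rid
        self.remaining_prefill = prompt_len
        self.is_being_chunked = 0

def schedule_prefill_chunk(req: Req, budget: int) -> bool:
    """Consume up to `budget` prompt tokens; True means prefill finished."""
    req.remaining_prefill -= min(budget, req.remaining_prefill)
    if req.remaining_prefill > 0:
        req.is_being_chunked += 1  # carried into the next batch
        return False
    return True

def handle_batch_result(req: Req) -> bool:
    """True if the request may stream output this round."""
    if req.is_being_chunked > 0:
        req.is_being_chunked -= 1  # a partial prefill round just completed
        return False
    return True
```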
```diff
@@ -1017,7 +1050,7 @@ class Scheduler:
                 if req.grammar is not None:
                     req.grammar.accept_token(next_token_id)
             else:
-                #
+                # being chunked reqs' prefill is not finished
                 req.is_being_chunked -= 1
 
         if batch.next_batch_sampling_info:
```
```diff
@@ -1045,7 +1078,7 @@ class Scheduler:
                 else:
                     self.tree_cache.cache_unfinished_req(req)
             else:
-                #
+                # being chunked reqs' prefill is not finished
                 req.is_being_chunked -= 1
 
         self.stream_output(batch.reqs)
```
```diff
@@ -1140,6 +1173,14 @@ class Scheduler:
                 + 1 : len(req.fill_ids)
                 - req.last_update_decode_tokens
             ]
+
+            # Clip the padded hash values from image tokens.
+            # Otherwise, it will lead to detokenization errors.
+            input_token_ids = [
+                x if x < self.model_config.vocab_size - 1 else 0
+                for x in input_token_ids
+            ]
+
             req.input_token_logprobs = list(zip(input_token_logprobs, input_token_ids))
 
         if (
```
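The dummy ids produced by image-token padding are hash values that can exceed the tokenizer's vocabulary, so the logprob bookkeeping clips them before they reach detokenization. The operation itself, isolated (same comprehension as in the diff, wrapped in a hypothetical helper):

```python
def clip_padded_image_ids(input_token_ids: list, vocab_size: int) -> list:
    """Replace out-of-vocab padded ids with 0 so the detokenizer never
    receives a token id the tokenizer cannot decode."""
    return [x if x < vocab_size - 1 else 0 for x in input_token_ids]

assert clip_padded_image_ids([1, 5, 2**40], vocab_size=32000) == [1, 5, 0]
```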
```diff
@@ -1344,18 +1385,20 @@ class Scheduler:
 
         if to_del is not None:
             del self.waiting_queue[to_del]
+            logger.debug(f"Abort queued request. {req.rid=}")
+            return
 
         # Delete requests in the running batch
         if self.running_batch:
             for req in self.running_batch.reqs:
                 if req.rid == recv_req.rid and not req.finished():
-                    req.to_abort = True
-
+                    logger.debug(f"Abort running request. {req.rid=}")
+                    req.to_abort = True
                     break
 
-    def update_weights(self, recv_req: UpdateWeightReqInput):
-        """In-place update of the weights."""
-        success, message = self.tp_worker.update_weights(recv_req)
+    def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
+        """In-place update of the weights from disk."""
+        success, message = self.tp_worker.update_weights_from_disk(recv_req)
         if success:
             flash_cache_success = self.flush_cache()
             assert flash_cache_success, "Cache flush failed after updating weights"
```
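Aborts stay cooperative: a queued request is deleted outright (with the new early `return`), while a running request only gets `to_abort` set and is reaped by the batch loop between steps. A minimal sketch of that two-path shape (container types simplified; not the real Scheduler method):

```python
def abort_request(rid: str, waiting_queue: list, running_reqs: list) -> None:
    # Path 1: still queued -> remove immediately; nothing was computed yet.
    for i, req in enumerate(waiting_queue):
        if req.rid == rid:
            del waiting_queue[i]
            return
    # Path 2: already running -> flag it; the event loop finishes the
    # current step and drops the request at the next batch boundary.
    for req in running_reqs:
        if req.rid == rid and not req.finished():
            req.to_abort = True
            break
```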
```diff
@@ -1363,6 +1406,27 @@ class Scheduler:
             logger.error(message)
         return success, message
 
+    def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
+        """Initialize the online model parameter update group."""
+        success, message = self.tp_worker.init_weights_update_group(recv_req)
+        return success, message
+
+    def update_weights_from_distributed(
+        self, recv_req: UpdateWeightsFromDistributedReqInput
+    ):
+        """Update the online model parameter."""
+        success, message = self.tp_worker.update_weights_from_distributed(recv_req)
+        if success:
+            flash_cache_success = self.flush_cache()
+            assert flash_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return success, message
+
+    def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
+        parameter = self.tp_worker.get_weights_by_name(recv_req)
+        return parameter
+
     def start_profile(self) -> None:
         if self.profiler is None:
             raise RuntimeError("Profiler is not enabled.")
```
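These scheduler methods are thin pass-throughs; the communication group and the in-place parameter copy live in the TP worker. A hedged sketch of what a worker-side `update_weights_from_distributed` could look like with plain `torch.distributed` (group bootstrap and parameter lookup simplified; not the actual sglang implementation):

```python
import torch
import torch.distributed as dist

def init_weights_update_group(master_addr: str, master_port: int,
                              rank: int, world_size: int) -> None:
    """Join a process group that also contains the external trainer (rank 0)."""
    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://{master_addr}:{master_port}",
        rank=rank,
        world_size=world_size,
    )

def update_weight_from_distributed(model: torch.nn.Module, name: str,
                                   dtype: torch.dtype, shape: tuple) -> None:
    """Receive one tensor broadcast by the trainer and copy it in place."""
    buf = torch.empty(shape, dtype=dtype, device="cuda")
    dist.broadcast(buf, src=0)  # the trainer owns rank 0 in the update group
    with torch.no_grad():
        dict(model.named_parameters())[name].copy_(buf)
```

Flushing the radix cache after a successful update, as the diff does, matters because cached KV entries were computed with the old weights and would silently mix stale state into new generations.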
```diff
@@ -1409,9 +1473,9 @@ def run_scheduler_process(
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
 
-    # [For Router] if env var "DP_RANK" exist, set dp_rank to the value of the env var
-    if dp_rank is None and "DP_RANK" in os.environ:
-        dp_rank = int(os.environ["DP_RANK"])
+    # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
+    if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
+        dp_rank = int(os.environ["SGLANG_DP_RANK"])
 
     if dp_rank is None:
         configure_logger(server_args, prefix=f" TP{tp_rank}")
```
```diff
@@ -1419,6 +1483,7 @@ def run_scheduler_process(
         configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
 
     suppress_other_loggers()
+    parent_process = psutil.Process().parent()
 
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
```
```diff
@@ -1430,6 +1495,6 @@ def run_scheduler_process(
         else:
             scheduler.event_loop_normal()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"Scheduler hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
```
sglang/srt/managers/session_controller.py
CHANGED
```diff
@@ -10,10 +10,7 @@
 # limitations under the License.
 # ==============================================================================
 
-import copy
 import uuid
-from dataclasses import dataclass
-from typing import Optional
 
 from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
 from sglang.srt.managers.schedule_batch import FINISH_ABORT, List, Req
```
|