sglang 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +33 -26
- sglang/api.py +9 -1
- sglang/bench_latency.py +2 -2
- sglang/bench_serving.py +10 -1
- sglang/check_env.py +1 -1
- sglang/lang/backend/litellm.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/interpreter.py +20 -5
- sglang/lang/ir.py +1 -1
- sglang/srt/constrained/__init__.py +15 -0
- sglang/srt/constrained/base_cache.py +15 -0
- sglang/srt/constrained/fsm_cache.py +15 -0
- sglang/srt/constrained/jump_forward.py +15 -0
- sglang/srt/conversation.py +26 -0
- sglang/srt/hf_transformers_utils.py +15 -0
- sglang/srt/layers/context_flashattention_nopad.py +15 -0
- sglang/srt/layers/extend_attention.py +15 -0
- sglang/srt/layers/fused_moe.py +15 -0
- sglang/srt/layers/linear.py +15 -0
- sglang/srt/layers/logits_processor.py +41 -13
- sglang/srt/layers/quantization/__init__.py +15 -0
- sglang/srt/layers/quantization/fp8.py +15 -0
- sglang/srt/layers/radix_attention.py +17 -2
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
- sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
- sglang/srt/managers/detokenizer_manager.py +16 -1
- sglang/srt/managers/io_struct.py +36 -3
- sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
- sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +31 -12
- sglang/srt/managers/tokenizer_manager.py +39 -16
- sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +130 -40
- sglang/srt/mem_cache/flush_cache.py +33 -0
- sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
- sglang/srt/{managers/controller → mem_cache}/radix_cache.py +15 -0
- sglang/srt/mm_utils.py +15 -0
- sglang/srt/model_config.py +15 -0
- sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
- sglang/srt/{managers/controller → model_executor}/model_runner.py +32 -12
- sglang/srt/model_loader/model_loader.py +15 -0
- sglang/srt/model_loader/utils.py +16 -1
- sglang/srt/models/chatglm.py +16 -1
- sglang/srt/models/commandr.py +16 -1
- sglang/srt/models/dbrx.py +16 -1
- sglang/srt/models/deepseek.py +16 -1
- sglang/srt/models/deepseek_v2.py +16 -1
- sglang/srt/models/gemma.py +16 -1
- sglang/srt/models/gemma2.py +16 -1
- sglang/srt/models/gpt_bigcode.py +16 -1
- sglang/srt/models/grok.py +16 -1
- sglang/srt/models/internlm2.py +16 -1
- sglang/srt/models/llama2.py +16 -1
- sglang/srt/models/llama_classification.py +16 -1
- sglang/srt/models/llava.py +17 -2
- sglang/srt/models/llavavid.py +17 -2
- sglang/srt/models/minicpm.py +16 -1
- sglang/srt/models/mistral.py +15 -0
- sglang/srt/models/mixtral.py +16 -1
- sglang/srt/models/mixtral_quant.py +16 -1
- sglang/srt/models/qwen.py +16 -1
- sglang/srt/models/qwen2.py +16 -1
- sglang/srt/models/qwen2_moe.py +16 -1
- sglang/srt/models/stablelm.py +16 -1
- sglang/srt/models/yivl.py +15 -0
- sglang/srt/openai_api/adapter.py +520 -135
- sglang/srt/openai_api/protocol.py +64 -0
- sglang/srt/sampling_params.py +15 -0
- sglang/srt/server.py +89 -23
- sglang/srt/server_args.py +49 -11
- sglang/srt/utils.py +15 -0
- sglang/utils.py +22 -0
- sglang/version.py +1 -1
- {sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/METADATA +32 -6
- sglang-0.2.7.dist-info/RECORD +93 -0
- {sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/WHEEL +1 -1
- sglang/srt/flush_cache.py +0 -18
- sglang-0.2.6.dist-info/RECORD +0 -93
- {sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/LICENSE +0 -0
- {sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/top_level.txt +0 -0
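Two mechanical changes account for most of this diff: an Apache-2.0 license header (the recurring +15) prepended to nearly every module, and the dissolution of the sglang/srt/managers/controller package: scheduling and batching moved up into sglang/srt/managers/ (schedule_batch.py, policy_scheduler.py), KV-cache code into a new sglang/srt/mem_cache/ package, and the model and CUDA-graph runners into a new sglang/srt/model_executor/ package. The substantive edits are concentrated in tp_worker.py (chunked prefill), openai_api/adapter.py, server.py, and server_args.py. Representative hunks follow.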
sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} RENAMED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """A tensor parallel worker."""
 
 import logging
@@ -14,23 +29,23 @@ from sglang.global_config import global_config
 from sglang.srt.constrained.fsm_cache import FSMCache
 from sglang.srt.constrained.jump_forward import JumpForwardCache
 from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
-from sglang.srt.managers.controller.infer_batch import (
-    FINISH_ABORT,
-    BaseFinishReason,
-    Batch,
-    ForwardMode,
-    Req,
-)
-from sglang.srt.managers.controller.model_runner import ModelRunner
-from sglang.srt.managers.controller.radix_cache import RadixCache
-from sglang.srt.managers.controller.schedule_heuristic import ScheduleHeuristic
 from sglang.srt.managers.io_struct import (
     AbortReq,
     BatchTokenIDOut,
     FlushCacheReq,
     TokenizedGenerateReqInput,
 )
+from sglang.srt.managers.policy_scheduler import PolicyScheduler
+from sglang.srt.managers.schedule_batch import (
+    FINISH_ABORT,
+    BaseFinishReason,
+    Batch,
+    ForwardMode,
+    Req,
+)
+from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.model_config import ModelConfig
+from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     get_int_token_logit_bias,
@@ -40,7 +55,7 @@ from sglang.srt.utils import (
 )
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger(
+logger = logging.getLogger(__name__)
 
 
 class ModelTpServer:
@@ -59,9 +74,13 @@ class ModelTpServer:
         self.tp_rank = tp_rank
         self.tp_size = server_args.tp_size
         self.dp_size = server_args.dp_size
-        self.
+        self.schedule_policy = server_args.schedule_policy
         self.disable_regex_jump_forward = server_args.disable_regex_jump_forward
 
+        # Chunked prefill
+        self.chunked_prefill_size = server_args.chunked_prefill_size
+        self.current_inflight_req = None
+
         # Init model and tokenizer
         self.model_config = ModelConfig(
             server_args.model_path,
@@ -117,7 +136,7 @@ class ModelTpServer:
 
         # Print info
         logger.info(
-            f"[
+            f"[gpu={self.gpu_id}] "
             f"max_total_num_tokens={self.max_total_num_tokens}, "
             f"max_prefill_tokens={self.max_prefill_tokens}, "
             f"max_running_requests={self.max_running_requests}, "
@@ -131,8 +150,8 @@ class ModelTpServer:
             disable=server_args.disable_radix_cache,
         )
         self.tree_cache_metrics = {"total": 0, "hit": 0}
-        self.scheduler =
-        self.
+        self.scheduler = PolicyScheduler(
+            self.schedule_policy,
             self.max_running_requests,
             self.max_prefill_tokens,
             self.max_total_num_tokens,
@@ -142,7 +161,7 @@ class ModelTpServer:
         self.token_to_kv_pool = self.model_runner.token_to_kv_pool
 
         # Init running status
-        self.
+        self.waiting_queue: List[Req] = []
         self.running_batch: Batch = None
         self.out_pyobjs = []
         self.decode_forward_ct = 0
@@ -205,6 +224,7 @@ class ModelTpServer:
             # Run a new prefill batch
             self.forward_prefill_batch(new_batch)
             self.cache_filled_batch(new_batch)
+            self.filter_out_inflight(new_batch)
 
             if not new_batch.is_empty():
                 if self.running_batch is None:
@@ -241,12 +261,12 @@ class ModelTpServer:
             self.num_generated_tokens = 0
             self.last_stats_tic = time.time()
             logger.info(
-                f"[
+                f"[gpu={self.gpu_id}] Decode batch. "
                 f"#running-req: {len(self.running_batch.reqs)}, "
                 f"#token: {num_used}, "
                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                 f"gen throughput (token/s): {throughput:.2f}, "
-                f"#queue-req: {len(self.
+                f"#queue-req: {len(self.waiting_queue)}"
             )
 
     def check_memory(self):
@@ -313,9 +333,10 @@ class ModelTpServer:
             ),
             self.max_req_input_len - 1 - len(req.origin_input_ids),
         )
-        self.
+        self.waiting_queue.append(req)
 
     def get_new_prefill_batch(self) -> Optional[Batch]:
+        # TODO(lsyin): organize this function
         running_bs = (
             len(self.running_batch.reqs) if self.running_batch is not None else 0
         )
@@ -323,7 +344,7 @@ class ModelTpServer:
             return
 
         # Compute matched prefix length
-        for req in self.
+        for req in self.waiting_queue:
             req.input_ids = req.origin_input_ids + req.output_ids
             prefix_indices, last_node = self.tree_cache.match_prefix(req.input_ids)
             if req.return_logprob:
@@ -333,7 +354,7 @@ class ModelTpServer:
             req.last_node = last_node
 
         # Get priority queue
-        self.
+        self.waiting_queue = self.scheduler.get_priority_queue(self.waiting_queue)
 
         # Add requests if there is available space
         can_run_list = []
@@ -352,7 +373,33 @@ class ModelTpServer:
             ]
         )
 
-
+        # Handle the current inflight request
+        take_inflight = 0
+        if self.current_inflight_req:
+            take_inflight = 1
+            r = self.current_inflight_req
+            r.input_ids = r.origin_input_ids + r.output_ids
+            truncated = (
+                len(r.input_ids) - len(r.prefix_indices) > self.chunked_prefill_size
+            )
+            r.extend_input_len = min(
+                len(r.input_ids) - len(r.prefix_indices), self.chunked_prefill_size
+            )
+            r.input_ids = r.input_ids[: len(r.prefix_indices) + r.extend_input_len]
+            can_run_list.append(r)
+
+            if not truncated:
+                # Finish inflight
+                self.current_inflight_req = None
+                new_batch_total_tokens += (
+                    r.extend_input_len + r.sampling_params.max_new_tokens
+                )
+                new_batch_input_tokens += r.extend_input_len
+            else:
+                new_batch_total_tokens += r.extend_input_len
+                new_batch_input_tokens += r.extend_input_len
+
+        for req in self.waiting_queue:
             if req.return_logprob and req.normalized_prompt_logprob is None:
                 # Need at least two tokens to compute normalized logprob
                 if req.extend_input_len < 2:
@@ -394,11 +441,39 @@ class ModelTpServer:
                     break
             else:
                 # Add this request to the running batch
-
-
-
-
-
+                if (
+                    self.chunked_prefill_size is None
+                    or (
+                        new_batch_input_tokens + req.extend_input_len
+                        <= self.chunked_prefill_size
+                    )
+                    or (
+                        req.return_logprob and req.normalized_prompt_logprob is None
+                    )
+                ):
+                    can_run_list.append(req)
+                    new_batch_total_tokens += (
+                        req.extend_input_len + req.sampling_params.max_new_tokens
+                    )
+                    new_batch_input_tokens += req.extend_input_len
+                else:
+                    trunc_len = self.chunked_prefill_size - new_batch_input_tokens
+
+                    if trunc_len <= 0:
+                        # Undo locking
+                        delta = self.tree_cache.dec_lock_ref(req.last_node)
+                        available_size += delta
+                        break
+
+                    req.extend_input_len = trunc_len
+                    req.input_ids = req.input_ids[
+                        : len(req.prefix_indices) + req.extend_input_len
+                    ]
+                    can_run_list.append(req)
+                    self.current_inflight_req = req
+                    new_batch_input_tokens += req.extend_input_len
+                    new_batch_total_tokens += req.extend_input_len
+                    break
         else:
             break
 
@@ -419,13 +494,13 @@ class ModelTpServer:
                 self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
             )
             logger.info(
-                f"[
+                f"[gpu={self.gpu_id}] Prefill batch. "
                 f"#new-seq: {len(can_run_list)}, "
                 f"#new-token: {new_batch_input_tokens}, "
                 f"#cached-token: {hit_tokens}, "
                 f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
                 f"#running-req: {running_bs}, "
-                f"#queue-req: {len(self.
+                f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + take_inflight}"
            )
 
         # Return the new batch
@@ -435,7 +510,7 @@ class ModelTpServer:
             self.token_to_kv_pool,
             self.tree_cache,
         )
-        self.
+        self.waiting_queue = [x for x in self.waiting_queue if x not in can_run_list]
         return new_batch
 
     def forward_prefill_batch(self, batch: Batch):
@@ -467,9 +542,10 @@ class ModelTpServer:
         # Check finish conditions
         pt = 0
         for i, req in enumerate(batch.reqs):
-            req
-
-
+            if req is not self.current_inflight_req:
+                req.completion_tokens_wo_jump_forward += 1
+                req.output_ids.append(next_token_ids[i])
+                req.check_finished()
 
             if req.return_logprob:
                 self.add_logprob_return_values(i, req, pt, next_token_ids, output)
@@ -530,7 +606,7 @@ class ModelTpServer:
         req_pool_indices_cpu = batch.req_pool_indices.cpu().numpy()
         for i, req in enumerate(batch.reqs):
             new_prefix_indices, new_last_node = self.tree_cache.cache_req(
-                token_ids=tuple(req.
+                token_ids=tuple(req.input_ids),
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
                 del_in_memory_pool=False,
@@ -538,6 +614,10 @@ class ModelTpServer:
             )
             req.prefix_indices, req.last_node = new_prefix_indices, new_last_node
 
+            if req is self.current_inflight_req:
+                # inflight request would get a new req idx
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[i]))
+
     def forward_decode_batch(self, batch: Batch):
         # Check if decode out of memory
         if not batch.check_decode_mem():
@@ -551,7 +631,7 @@ class ModelTpServer:
                 f"#retracted_reqs: {len(retracted_reqs)}, "
                 f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
             )
-            self.
+            self.waiting_queue.extend(retracted_reqs)
         else:
             self.new_token_ratio = max(
                 self.new_token_ratio - self.new_token_ratio_decay,
@@ -561,7 +641,7 @@ class ModelTpServer:
         if not self.disable_regex_jump_forward:
             # Check for jump-forward
             jump_forward_reqs = batch.check_for_jump_forward(self.model_runner)
-            self.
+            self.waiting_queue.extend(jump_forward_reqs)
             if batch.is_empty():
                 return
 
@@ -696,8 +776,18 @@ class ModelTpServer:
         else:
             batch.reqs = []
 
+    def filter_out_inflight(self, batch: Batch):
+        # TODO(lsyin): reduce the overhead, make a special version for this
+        if self.current_inflight_req is None:
+            return
+
+        to_remove = batch.reqs.index(self.current_inflight_req)
+        unfinished_indices = [i for i in range(len(batch.reqs)) if i != to_remove]
+
+        batch.filter_batch(unfinished_indices)
+
     def flush_cache(self):
-        if len(self.
+        if len(self.waiting_queue) == 0 and (
             self.running_batch is None or len(self.running_batch.reqs) == 0
         ):
             self.tree_cache.reset()
@@ -710,20 +800,20 @@ class ModelTpServer:
         else:
             warnings.warn(
                 f"Cache not flushed because there are pending requests. "
-                f"#queue-req: {len(self.
+                f"#queue-req: {len(self.waiting_queue)}, "
                 f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}"
             )
 
     def abort_request(self, recv_req):
         # Delete requests in the waiting queue
         to_del = None
-        for i, req in enumerate(self.
+        for i, req in enumerate(self.waiting_queue):
             if req.rid == recv_req.rid:
                 to_del = i
                 break
 
         if to_del is not None:
-            del self.
+            del self.waiting_queue[to_del]
 
         # Delete requests in the running batch
         if self.running_batch:
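The hunks above implement chunked prefill: a prefill batch may add at most chunked_prefill_size new input tokens, and a request that does not fit is truncated at the budget and parked as current_inflight_req so the next prefill batch resumes it from its cached prefix. Below is a minimal, self-contained sketch of that splitting invariant; ToyReq and schedule_prefill are hypothetical stand-ins for illustration, not the real sglang Req class or scheduler.

# Sketch of the chunked-prefill split, assuming a toy request type.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class ToyReq:
    input_ids: List[int]       # full prompt token ids
    prefix_len: int = 0        # tokens already prefilled or served from cache
    extend_input_len: int = 0  # tokens this batch will prefill


def schedule_prefill(queue: List[ToyReq], chunk_size: int):
    """Fill one prefill batch under a chunk_size token budget."""
    batch: List[ToyReq] = []
    inflight: Optional[ToyReq] = None
    budget = chunk_size
    for req in queue:
        remaining = len(req.input_ids) - req.prefix_len
        if remaining <= budget:
            req.extend_input_len = remaining  # the whole request fits
            budget -= remaining
            batch.append(req)
        else:
            req.extend_input_len = budget     # truncate at the budget ...
            batch.append(req)
            inflight = req                    # ... and resume it next batch
            break
    return batch, inflight


reqs = [ToyReq(list(range(300))), ToyReq(list(range(900)))]
batch, inflight = schedule_prefill(reqs, chunk_size=512)
print([r.extend_input_len for r in batch])  # [300, 212]
print(inflight is reqs[1])                  # True

The real worker also charges max_new_tokens against the token budget once a request's prefill completes, advances the inflight request's cached prefix after each chunk, and frees its temporary request slot (see cache_filled_batch above); that bookkeeping is omitted here.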
sglang/srt/mem_cache/flush_cache.py ADDED
@@ -0,0 +1,33 @@
+<15-line Apache-2.0 license header, identical to the one shown in tp_worker.py>
+"""
+Flush the KV cache.
+
+Usage:
+python3 -m sglang.srt.mem_cache.flush_cache --url http://localhost:30000
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", type=str, default="http://localhost:30000")
+    args = parser.parse_args()
+
+    response = requests.get(args.url + "/flush_cache")
+    assert response.status_code == 200
sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} RENAMED
@@ -1,3 +1,18 @@
+<15-line Apache-2.0 license header, identical to the one shown in tp_worker.py>
 """Memory pool."""
 
 import logging
@@ -30,7 +45,7 @@ class ReqToTokenPool:
 
         return select_index
 
-    def free(self, free_index
+    def free(self, free_index):
         self.mem_state[free_index] = True
         if isinstance(free_index, (int,)):
             self.can_use_mem_size += 1
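The free signature above drops its scalar-only annotation (the removed line is truncated in this view) because tp_worker.py now frees a single slot by integer index for the inflight request, while other call sites can pass a whole index array (not shown in this diff). Numpy fancy indexing handles both shapes in mem_state[free_index] = True, and the isinstance check picks the right count. A toy sketch of the pattern follows; ToyPool is illustrative, not the real ReqToTokenPool.

# Toy free-list pool whose free() accepts an int or an index array.
import numpy as np


class ToyPool:
    def __init__(self, size: int):
        self.mem_state = np.ones(size, dtype=bool)  # True = slot free
        self.can_use_mem_size = size

    def alloc(self, need: int):
        idx = np.nonzero(self.mem_state)[0][:need]
        if len(idx) < need:
            return None
        self.mem_state[idx] = False
        self.can_use_mem_size -= need
        return idx

    def free(self, free_index):
        # numpy accepts a scalar or an index array here
        self.mem_state[free_index] = True
        if isinstance(free_index, (int,)):
            self.can_use_mem_size += 1
        else:
            self.can_use_mem_size += len(free_index)


pool = ToyPool(8)
batch = pool.alloc(3)
pool.free(int(batch[0]))      # one slot, like the inflight request's req idx
pool.free(batch[1:])          # the remaining slots in one call
print(pool.can_use_mem_size)  # 8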
sglang/srt/{managers/controller → mem_cache}/radix_cache.py RENAMED
@@ -1,3 +1,18 @@
+<15-line Apache-2.0 license header, identical to the one shown in tp_worker.py>
 """
 The radix tree data structure for managing the KV cache.
 """
sglang/srt/mm_utils.py CHANGED
@@ -1,3 +1,18 @@
+<15-line Apache-2.0 license header, identical to the one shown in tp_worker.py>
 # Source: https://github.com/haotian-liu/LLaVA/blob/main/llava/mm_utils.py
 import ast
 import base64
sglang/srt/model_config.py CHANGED
@@ -1,3 +1,18 @@
+<15-line Apache-2.0 license header, identical to the one shown in tp_worker.py>
 from typing import Optional
 
 from transformers import PretrainedConfig
sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py RENAMED
@@ -1,3 +1,18 @@
+<15-line Apache-2.0 license header, identical to the one shown in tp_worker.py>
 """Run the model with cuda graph."""
 
 import bisect
@@ -14,7 +29,7 @@ from sglang.srt.layers.logits_processor import (
     LogitsMetadata,
     LogitsProcessor,
 )
-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import (
     Batch,
     ForwardMode,
     InputMetadata,
sglang/srt/{managers/controller → model_executor}/model_runner.py RENAMED
@@ -1,3 +1,18 @@
+<15-line Apache-2.0 license header, identical to the one shown in tp_worker.py>
 """ModelRunner runs the forward passes of the models."""
 
 import importlib
@@ -25,13 +40,13 @@ from vllm.distributed import (
 from vllm.model_executor.models import ModelRegistry
 
 from sglang.global_config import global_config
-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import (
     Batch,
     ForwardMode,
     InputMetadata,
     global_server_args_dict,
 )
-from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPool
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     get_available_gpu_memory,
@@ -42,7 +57,7 @@ from sglang.srt.utils import (
     monkey_patch_vllm_qvk_linear_loader,
 )
 
-logger = logging.getLogger(
+logger = logging.getLogger(__name__)
 
 
 class ModelRunner:
@@ -75,7 +90,7 @@ class ModelRunner:
 
         # Init torch distributed
         torch.cuda.set_device(self.gpu_id)
-        logger.info(f"[
+        logger.info(f"[gpu={self.gpu_id}] Init nccl begin.")
 
         if not server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)
@@ -115,7 +130,7 @@ class ModelRunner:
 
     def load_model(self):
         logger.info(
-            f"[
+            f"[gpu={self.gpu_id}] Load weight begin. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
@@ -163,7 +178,7 @@ class ModelRunner:
             cache_config=None,
         )
         logger.info(
-            f"[
+            f"[gpu={self.gpu_id}] Load weight end. "
             f"type={type(self.model).__name__}, "
             f"dtype={self.dtype}, "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
@@ -197,9 +212,14 @@ class ModelRunner:
         )
 
         if max_num_reqs is None:
-            max_num_reqs =
-
-
+            max_num_reqs = min(
+                max(
+                    int(
+                        self.max_total_num_tokens / self.model_config.context_len * 512
+                    ),
+                    2048,
+                ),
+                5120,
             )
 
         self.req_to_token_pool = ReqToTokenPool(
@@ -214,7 +234,7 @@ class ModelRunner:
             layer_num=self.model_config.num_hidden_layers,
         )
         logger.info(
-            f"[
+            f"[gpu={self.gpu_id}] Memory pool end. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
@@ -258,14 +278,14 @@ class ModelRunner:
         )
 
     def init_cuda_graphs(self):
-        from sglang.srt.
+        from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
 
         if self.server_args.disable_cuda_graph or self.server_args.disable_flashinfer:
             self.cuda_graph_runner = None
             return
 
         logger.info(
-            f"[
+            f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
        )
         batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         self.cuda_graph_runner = CudaGraphRunner(
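The new max_num_reqs default above clamps the request-slot count between 2048 and 5120, scaling with how many requests the KV pool could plausibly hold relative to the model's context length. A quick worked example of the reconstructed expression; the argument values are made up for illustration.

def default_max_num_reqs(max_total_num_tokens: int, context_len: int) -> int:
    # Mirrors the expression in the hunk above.
    return min(max(int(max_total_num_tokens / context_len * 512), 2048), 5120)

print(default_max_num_reqs(430_000, 8192))  # 26875 -> clamped down to 5120
print(default_max_num_reqs(20_000, 8192))   # 1250  -> clamped up to 2048
print(default_max_num_reqs(60_000, 8192))   # 3750  -> within range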
sglang/srt/model_loader/model_loader.py CHANGED
@@ -1,3 +1,18 @@
+<15-line Apache-2.0 license header, identical to the one shown in tp_worker.py>
 # temporarily adapted from https://github.com/vllm-project/vllm/blob/10383887e03412196a2689b9398290719c4797bf/vllm/model_executor/model_loader/loader.py
 # FIXME: in progress of refactoring the model loader
 
sglang/srt/model_loader/utils.py CHANGED
@@ -1,3 +1,18 @@
+<15-line Apache-2.0 license header, identical to the one shown in tp_worker.py>
 # temporarily adapted from vLLM
 # FIXME: in progress of refactoring the model loader
 """Utilities for selecting and loading models."""
@@ -23,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConf
 
 from sglang.srt.layers.quantization import get_quantization_config
 
-logger = logging.getLogger(
+logger = logging.getLogger(__name__)
 temp_dir = tempfile.gettempdir()
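One small pattern repeats across tp_worker.py, model_runner.py, and model_loader/utils.py above: loggers are now created with logging.getLogger(__name__), so records are tagged with each module's new dotted path after the package reshuffle, rather than whatever fixed name the truncated old lines carried. For example (the module name in the comment is an assumption):

import logging

logger = logging.getLogger(__name__)  # e.g. "sglang.srt.model_loader.utils"

logging.basicConfig(format="%(name)s: %(message)s", level=logging.INFO)
logger.info("tagged with the importing module's dotted path")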