sglang 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/__init__.py +33 -26
  2. sglang/api.py +9 -1
  3. sglang/bench_latency.py +2 -2
  4. sglang/bench_serving.py +10 -1
  5. sglang/check_env.py +1 -1
  6. sglang/lang/backend/litellm.py +1 -1
  7. sglang/lang/backend/openai.py +1 -1
  8. sglang/lang/backend/runtime_endpoint.py +4 -4
  9. sglang/lang/interpreter.py +24 -9
  10. sglang/lang/ir.py +1 -1
  11. sglang/srt/constrained/__init__.py +15 -0
  12. sglang/srt/constrained/base_cache.py +15 -0
  13. sglang/srt/constrained/fsm_cache.py +36 -1
  14. sglang/srt/constrained/jump_forward.py +15 -0
  15. sglang/srt/conversation.py +26 -0
  16. sglang/srt/hf_transformers_utils.py +18 -1
  17. sglang/srt/layers/context_flashattention_nopad.py +15 -0
  18. sglang/srt/layers/extend_attention.py +15 -0
  19. sglang/srt/layers/fused_moe.py +15 -0
  20. sglang/srt/layers/linear.py +15 -0
  21. sglang/srt/layers/logits_processor.py +109 -72
  22. sglang/srt/layers/quantization/__init__.py +15 -0
  23. sglang/srt/layers/quantization/fp8.py +15 -0
  24. sglang/srt/layers/radix_attention.py +21 -3
  25. sglang/srt/layers/token_attention.py +16 -1
  26. sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
  27. sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
  28. sglang/srt/managers/detokenizer_manager.py +16 -1
  29. sglang/srt/managers/io_struct.py +38 -5
  30. sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
  31. sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +85 -25
  32. sglang/srt/managers/tokenizer_manager.py +99 -57
  33. sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +177 -81
  34. sglang/srt/mem_cache/flush_cache.py +33 -0
  35. sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
  36. sglang/srt/{managers/controller → mem_cache}/radix_cache.py +15 -0
  37. sglang/srt/mm_utils.py +15 -0
  38. sglang/srt/model_config.py +20 -0
  39. sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +42 -18
  40. sglang/srt/{managers/controller → model_executor}/model_runner.py +51 -16
  41. sglang/srt/model_loader/model_loader.py +15 -0
  42. sglang/srt/model_loader/utils.py +16 -1
  43. sglang/srt/models/chatglm.py +16 -1
  44. sglang/srt/models/commandr.py +16 -1
  45. sglang/srt/models/dbrx.py +16 -1
  46. sglang/srt/models/deepseek.py +16 -1
  47. sglang/srt/models/deepseek_v2.py +532 -0
  48. sglang/srt/models/gemma.py +16 -1
  49. sglang/srt/models/gemma2.py +16 -1
  50. sglang/srt/models/gpt_bigcode.py +16 -1
  51. sglang/srt/models/grok.py +16 -1
  52. sglang/srt/models/internlm2.py +16 -1
  53. sglang/srt/models/llama2.py +16 -1
  54. sglang/srt/models/llama_classification.py +19 -4
  55. sglang/srt/models/llava.py +17 -2
  56. sglang/srt/models/llavavid.py +17 -2
  57. sglang/srt/models/minicpm.py +16 -1
  58. sglang/srt/models/mistral.py +15 -0
  59. sglang/srt/models/mixtral.py +16 -1
  60. sglang/srt/models/mixtral_quant.py +16 -1
  61. sglang/srt/models/qwen.py +16 -1
  62. sglang/srt/models/qwen2.py +16 -1
  63. sglang/srt/models/qwen2_moe.py +16 -1
  64. sglang/srt/models/stablelm.py +16 -1
  65. sglang/srt/models/yivl.py +15 -0
  66. sglang/srt/openai_api/adapter.py +545 -160
  67. sglang/srt/openai_api/protocol.py +65 -1
  68. sglang/srt/sampling_params.py +20 -4
  69. sglang/srt/server.py +90 -37
  70. sglang/srt/server_args.py +76 -17
  71. sglang/srt/utils.py +15 -0
  72. sglang/test/test_programs.py +5 -1
  73. sglang/utils.py +22 -0
  74. sglang/version.py +1 -1
  75. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/METADATA +40 -12
  76. sglang-0.2.7.dist-info/RECORD +93 -0
  77. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/WHEEL +1 -1
  78. sglang/srt/flush_cache.py +0 -18
  79. sglang-0.2.5.dist-info/RECORD +0 -92
  80. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/LICENSE +0 -0
  81. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} RENAMED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """A tensor parallel worker."""

  import logging
@@ -14,23 +29,23 @@ from sglang.global_config import global_config
  from sglang.srt.constrained.fsm_cache import FSMCache
  from sglang.srt.constrained.jump_forward import JumpForwardCache
  from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
- from sglang.srt.managers.controller.infer_batch import (
-     FINISH_ABORT,
-     BaseFinishReason,
-     Batch,
-     ForwardMode,
-     Req,
- )
- from sglang.srt.managers.controller.model_runner import ModelRunner
- from sglang.srt.managers.controller.radix_cache import RadixCache
- from sglang.srt.managers.controller.schedule_heuristic import ScheduleHeuristic
  from sglang.srt.managers.io_struct import (
      AbortReq,
      BatchTokenIDOut,
      FlushCacheReq,
      TokenizedGenerateReqInput,
  )
+ from sglang.srt.managers.policy_scheduler import PolicyScheduler
+ from sglang.srt.managers.schedule_batch import (
+     FINISH_ABORT,
+     BaseFinishReason,
+     Batch,
+     ForwardMode,
+     Req,
+ )
+ from sglang.srt.mem_cache.radix_cache import RadixCache
  from sglang.srt.model_config import ModelConfig
+ from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.utils import (
      get_int_token_logit_bias,
@@ -40,7 +55,7 @@ from sglang.srt.utils import (
  )
  from sglang.utils import get_exception_traceback

- logger = logging.getLogger("srt.tp_worker")
+ logger = logging.getLogger(__name__)


  class ModelTpServer:
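
For downstream code that imported from the old controller package, the corresponding 0.2.7 locations (taken directly from the hunk above; assumes sglang 0.2.7 is installed) are:

    # 0.2.5 layout (removed)
    # from sglang.srt.managers.controller.infer_batch import Batch, ForwardMode, Req
    # from sglang.srt.managers.controller.model_runner import ModelRunner
    # from sglang.srt.managers.controller.radix_cache import RadixCache
    # from sglang.srt.managers.controller.schedule_heuristic import ScheduleHeuristic

    # 0.2.7 layout (added)
    from sglang.srt.managers.policy_scheduler import PolicyScheduler
    from sglang.srt.managers.schedule_batch import Batch, ForwardMode, Req
    from sglang.srt.mem_cache.radix_cache import RadixCache
    from sglang.srt.model_executor.model_runner import ModelRunner
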
@@ -59,9 +74,13 @@ class ModelTpServer:
          self.tp_rank = tp_rank
          self.tp_size = server_args.tp_size
          self.dp_size = server_args.dp_size
-         self.schedule_heuristic = server_args.schedule_heuristic
+         self.schedule_policy = server_args.schedule_policy
          self.disable_regex_jump_forward = server_args.disable_regex_jump_forward

+         # Chunked prefill
+         self.chunked_prefill_size = server_args.chunked_prefill_size
+         self.current_inflight_req = None
+
          # Init model and tokenizer
          self.model_config = ModelConfig(
              server_args.model_path,
@@ -98,22 +117,26 @@ class ModelTpServer:
              if server_args.max_prefill_tokens is None
              else server_args.max_prefill_tokens
          )
-         self.max_running_requests = (
-             self.max_total_num_tokens // 2
-             if server_args.max_running_requests is None
-             else server_args.max_running_requests
-         )
          self.max_running_requests = min(
-             self.max_running_requests, self.model_runner.req_to_token_pool.size - 1
+             (
+                 self.max_total_num_tokens // 2
+                 if server_args.max_running_requests is None
+                 else server_args.max_running_requests
+             ),
+             self.model_runner.req_to_token_pool.size - 1,
          )
          self.int_token_logit_bias = torch.tensor(
              get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size)
          )
+         self.max_req_input_len = min(
+             self.model_config.context_len - 1,
+             self.max_total_num_tokens - 1,
+         )
          set_random_seed(server_args.random_seed)

          # Print info
          logger.info(
-             f"[gpu_id={self.gpu_id}] "
+             f"[gpu={self.gpu_id}] "
              f"max_total_num_tokens={self.max_total_num_tokens}, "
              f"max_prefill_tokens={self.max_prefill_tokens}, "
              f"max_running_requests={self.max_running_requests}, "
@@ -127,8 +150,8 @@ class ModelTpServer:
              disable=server_args.disable_radix_cache,
          )
          self.tree_cache_metrics = {"total": 0, "hit": 0}
-         self.scheduler = ScheduleHeuristic(
-             self.schedule_heuristic,
+         self.scheduler = PolicyScheduler(
+             self.schedule_policy,
              self.max_running_requests,
              self.max_prefill_tokens,
              self.max_total_num_tokens,
@@ -138,7 +161,7 @@ class ModelTpServer:
          self.token_to_kv_pool = self.model_runner.token_to_kv_pool

          # Init running status
-         self.forward_queue: List[Req] = []
+         self.waiting_queue: List[Req] = []
          self.running_batch: Batch = None
          self.out_pyobjs = []
          self.decode_forward_ct = 0
@@ -201,6 +224,7 @@ class ModelTpServer:
              # Run a new prefill batch
              self.forward_prefill_batch(new_batch)
              self.cache_filled_batch(new_batch)
+             self.filter_out_inflight(new_batch)

              if not new_batch.is_empty():
                  if self.running_batch is None:
@@ -237,12 +261,12 @@ class ModelTpServer:
                  self.num_generated_tokens = 0
                  self.last_stats_tic = time.time()
                  logger.info(
-                     f"[gpu_id={self.gpu_id}] Decode batch. "
+                     f"[gpu={self.gpu_id}] Decode batch. "
                      f"#running-req: {len(self.running_batch.reqs)}, "
                      f"#token: {num_used}, "
                      f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                      f"gen throughput (token/s): {throughput:.2f}, "
-                     f"#queue-req: {len(self.forward_queue)}"
+                     f"#queue-req: {len(self.waiting_queue)}"
                  )

      def check_memory(self):
@@ -295,21 +319,24 @@ class ModelTpServer:
                  )

          # Truncate prompts that are too long
-         req.origin_input_ids = req.origin_input_ids[: self.model_config.context_len - 1]
+         if len(req.origin_input_ids) >= self.max_req_input_len:
+             logger.warn(
+                 "Request length is longer than the KV cache pool size or "
+                 "the max context length. Truncated!!!"
+             )
+             req.origin_input_ids = req.origin_input_ids[: self.max_req_input_len]
          req.sampling_params.max_new_tokens = min(
-             req.sampling_params.max_new_tokens,
-             self.model_config.context_len - 1 - len(req.origin_input_ids),
-             self.max_total_num_tokens - 128 - len(req.origin_input_ids),
+             (
+                 req.sampling_params.max_new_tokens
+                 if req.sampling_params.max_new_tokens is not None
+                 else 1 << 30
+             ),
+             self.max_req_input_len - 1 - len(req.origin_input_ids),
          )
-         if req.sampling_params.max_new_tokens < 0:
-             req.origin_input_ids = req.origin_input_ids[
-                 : self.max_total_num_tokens - 128
-             ]
-             logger.error("Request longer than memory pool size, truncated!!!")
-
-         self.forward_queue.append(req)
+         self.waiting_queue.append(req)

      def get_new_prefill_batch(self) -> Optional[Batch]:
+         # TODO(lsyin): organize this function
          running_bs = (
              len(self.running_batch.reqs) if self.running_batch is not None else 0
          )
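
A minimal sketch of the new clamping rule above, with hypothetical sizes (the constants below are illustrative, not defaults): max_req_input_len is min(context_len - 1, max_total_num_tokens - 1), over-long prompts are cut to that length, and an unset max_new_tokens is treated as effectively unbounded (1 << 30) before being clamped.

    # Illustrative only: reproduces the truncation/clamping above in isolation.
    context_len = 4096
    max_total_num_tokens = 8192
    max_req_input_len = min(context_len - 1, max_total_num_tokens - 1)  # 4095

    origin_input_ids = list(range(3000))   # hypothetical 3000-token prompt
    requested_max_new_tokens = None        # client did not set max_new_tokens

    if len(origin_input_ids) >= max_req_input_len:
        origin_input_ids = origin_input_ids[:max_req_input_len]

    max_new_tokens = min(
        requested_max_new_tokens if requested_max_new_tokens is not None else 1 << 30,
        max_req_input_len - 1 - len(origin_input_ids),
    )
    print(max_new_tokens)  # 1094
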
@@ -317,7 +344,7 @@ class ModelTpServer:
              return

          # Compute matched prefix length
-         for req in self.forward_queue:
+         for req in self.waiting_queue:
              req.input_ids = req.origin_input_ids + req.output_ids
              prefix_indices, last_node = self.tree_cache.match_prefix(req.input_ids)
              if req.return_logprob:
@@ -327,7 +354,7 @@ class ModelTpServer:
              req.last_node = last_node

          # Get priority queue
-         self.forward_queue = self.scheduler.get_priority_queue(self.forward_queue)
+         self.waiting_queue = self.scheduler.get_priority_queue(self.waiting_queue)

          # Add requests if there is available space
          can_run_list = []
@@ -346,7 +373,33 @@ class ModelTpServer:
                  ]
              )

-         for req in self.forward_queue:
+         # Handle the current inflight request
+         take_inflight = 0
+         if self.current_inflight_req:
+             take_inflight = 1
+             r = self.current_inflight_req
+             r.input_ids = r.origin_input_ids + r.output_ids
+             truncated = (
+                 len(r.input_ids) - len(r.prefix_indices) > self.chunked_prefill_size
+             )
+             r.extend_input_len = min(
+                 len(r.input_ids) - len(r.prefix_indices), self.chunked_prefill_size
+             )
+             r.input_ids = r.input_ids[: len(r.prefix_indices) + r.extend_input_len]
+             can_run_list.append(r)
+
+             if not truncated:
+                 # Finish inflight
+                 self.current_inflight_req = None
+                 new_batch_total_tokens += (
+                     r.extend_input_len + r.sampling_params.max_new_tokens
+                 )
+                 new_batch_input_tokens += r.extend_input_len
+             else:
+                 new_batch_total_tokens += r.extend_input_len
+                 new_batch_input_tokens += r.extend_input_len
+
+         for req in self.waiting_queue:
              if req.return_logprob and req.normalized_prompt_logprob is None:
                  # Need at least two tokens to compute normalized logprob
                  if req.extend_input_len < 2:
@@ -388,11 +441,39 @@ class ModelTpServer:
                      break
                  else:
                      # Add this request to the running batch
-                     can_run_list.append(req)
-                     new_batch_total_tokens += (
-                         req.extend_input_len + req.sampling_params.max_new_tokens
-                     )
-                     new_batch_input_tokens += req.extend_input_len
+                     if (
+                         self.chunked_prefill_size is None
+                         or (
+                             new_batch_input_tokens + req.extend_input_len
+                             <= self.chunked_prefill_size
+                         )
+                         or (
+                             req.return_logprob and req.normalized_prompt_logprob is None
+                         )
+                     ):
+                         can_run_list.append(req)
+                         new_batch_total_tokens += (
+                             req.extend_input_len + req.sampling_params.max_new_tokens
+                         )
+                         new_batch_input_tokens += req.extend_input_len
+                     else:
+                         trunc_len = self.chunked_prefill_size - new_batch_input_tokens
+
+                         if trunc_len <= 0:
+                             # Undo locking
+                             delta = self.tree_cache.dec_lock_ref(req.last_node)
+                             available_size += delta
+                             break
+
+                         req.extend_input_len = trunc_len
+                         req.input_ids = req.input_ids[
+                             : len(req.prefix_indices) + req.extend_input_len
+                         ]
+                         can_run_list.append(req)
+                         self.current_inflight_req = req
+                         new_batch_input_tokens += req.extend_input_len
+                         new_batch_total_tokens += req.extend_input_len
+                         break
              else:
                  break

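
The two hunks above are the core of the new chunked prefill path: when server_args.chunked_prefill_size is set, a long prompt is admitted in fixed-size slices, and a request that still has un-prefilled tokens is carried over as current_inflight_req into the next scheduling round. A minimal standalone sketch of that slicing (a hypothetical helper for illustration, not part of the package):

    def chunked_prefill_rounds(prompt_len: int, prefix_len: int, chunk_size: int):
        """Yield (extend_len, finished) per scheduling round, mimicking the logic above."""
        done = prefix_len
        while done < prompt_len:
            extend_len = min(prompt_len - done, chunk_size)  # cf. r.extend_input_len
            done += extend_len
            yield extend_len, done >= prompt_len

    # A 10k-token prompt with a 2k-token cached prefix and chunked_prefill_size=4096:
    print(list(chunked_prefill_rounds(10_000, 2_000, 4_096)))
    # [(4096, False), (3904, True)]
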
@@ -413,13 +494,13 @@ class ModelTpServer:
                  self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
              )
              logger.info(
-                 f"[gpu_id={self.gpu_id}] Prefill batch. "
+                 f"[gpu={self.gpu_id}] Prefill batch. "
                  f"#new-seq: {len(can_run_list)}, "
                  f"#new-token: {new_batch_input_tokens}, "
                  f"#cached-token: {hit_tokens}, "
                  f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
                  f"#running-req: {running_bs}, "
-                 f"#queue-req: {len(self.forward_queue) - len(can_run_list)}"
+                 f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + take_inflight}"
              )

          # Return the new batch
@@ -429,7 +510,7 @@ class ModelTpServer:
              self.token_to_kv_pool,
              self.tree_cache,
          )
-         self.forward_queue = [x for x in self.forward_queue if x not in can_run_list]
+         self.waiting_queue = [x for x in self.waiting_queue if x not in can_run_list]
          return new_batch

      def forward_prefill_batch(self, batch: Batch):
@@ -449,7 +530,7 @@ class ModelTpServer:
                      torch.arange(len(next_token_ids), device=next_token_ids.device),
                      next_token_ids,
                  ].tolist()
-                 output.prefill_token_logprobs = output.prefill_token_logprobs.tolist()
+                 output.input_token_logprobs = output.input_token_logprobs.tolist()
                  output.normalized_prompt_logprobs = (
                      output.normalized_prompt_logprobs.tolist()
                  )
@@ -461,9 +542,10 @@ class ModelTpServer:
          # Check finish conditions
          pt = 0
          for i, req in enumerate(batch.reqs):
-             req.completion_tokens_wo_jump_forward += 1
-             req.output_ids.append(next_token_ids[i])
-             req.check_finished()
+             if req is not self.current_inflight_req:
+                 req.completion_tokens_wo_jump_forward += 1
+                 req.output_ids.append(next_token_ids[i])
+                 req.check_finished()

              if req.return_logprob:
                  self.add_logprob_return_values(i, req, pt, next_token_ids, output)
@@ -475,24 +557,24 @@ class ModelTpServer:
          if req.normalized_prompt_logprob is None:
              req.normalized_prompt_logprob = output.normalized_prompt_logprobs[i]

-         if req.prefill_token_logprobs is None:
+         if req.input_token_logprobs is None:
              # If logprob_start_len > 0, then first logprob_start_len prompt tokens will be ignored.
-             req.prefill_token_logprobs = list(
+             req.input_token_logprobs = list(
                  zip(
-                     output.prefill_token_logprobs[pt : pt + req.extend_input_len - 1],
+                     output.input_token_logprobs[pt : pt + req.extend_input_len - 1],
                      req.input_ids[-req.extend_input_len + 1 :],
                  )
              )
              if req.logprob_start_len == 0:
-                 req.prefill_token_logprobs = [
+                 req.input_token_logprobs = [
                      (None, req.input_ids[0])
-                 ] + req.prefill_token_logprobs
+                 ] + req.input_token_logprobs

          if req.last_update_decode_tokens != 0:
-             req.decode_token_logprobs.extend(
+             req.output_token_logprobs.extend(
                  list(
                      zip(
-                         output.prefill_token_logprobs[
+                         output.input_token_logprobs[
                              pt
                              + req.extend_input_len
                              - req.last_update_decode_tokens : pt
@@ -504,27 +586,27 @@ class ModelTpServer:
                  )
              )

-         req.decode_token_logprobs.append(
+         req.output_token_logprobs.append(
              (output.next_token_logprobs[i], next_token_ids[i])
          )

          if req.top_logprobs_num > 0:
-             if req.prefill_top_logprobs is None:
-                 req.prefill_top_logprobs = output.prefill_top_logprobs[i]
+             if req.input_top_logprobs is None:
+                 req.input_top_logprobs = output.input_top_logprobs[i]
                  if req.logprob_start_len == 0:
-                     req.prefill_top_logprobs = [None] + req.prefill_top_logprobs
+                     req.input_top_logprobs = [None] + req.input_top_logprobs

              if req.last_update_decode_tokens != 0:
-                 req.decode_top_logprobs.extend(
-                     output.prefill_top_logprobs[i][-req.last_update_decode_tokens + 1 :]
+                 req.output_top_logprobs.extend(
+                     output.input_top_logprobs[i][-req.last_update_decode_tokens + 1 :]
                  )
-             req.decode_top_logprobs.append(output.decode_top_logprobs[i])
+             req.output_top_logprobs.append(output.output_top_logprobs[i])

      def cache_filled_batch(self, batch: Batch):
          req_pool_indices_cpu = batch.req_pool_indices.cpu().numpy()
          for i, req in enumerate(batch.reqs):
              new_prefix_indices, new_last_node = self.tree_cache.cache_req(
-                 token_ids=tuple(req.origin_input_ids + req.output_ids)[:-1],
+                 token_ids=tuple(req.input_ids),
                  last_uncached_pos=len(req.prefix_indices),
                  req_pool_idx=req_pool_indices_cpu[i],
                  del_in_memory_pool=False,
@@ -532,6 +614,10 @@ class ModelTpServer:
              )
              req.prefix_indices, req.last_node = new_prefix_indices, new_last_node

+             if req is self.current_inflight_req:
+                 # inflight request would get a new req idx
+                 self.req_to_token_pool.free(int(req_pool_indices_cpu[i]))
+
      def forward_decode_batch(self, batch: Batch):
          # Check if decode out of memory
          if not batch.check_decode_mem():
@@ -545,7 +631,7 @@ class ModelTpServer:
                  f"#retracted_reqs: {len(retracted_reqs)}, "
                  f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
              )
-             self.forward_queue.extend(retracted_reqs)
+             self.waiting_queue.extend(retracted_reqs)
          else:
              self.new_token_ratio = max(
                  self.new_token_ratio - self.new_token_ratio_decay,
@@ -555,7 +641,7 @@ class ModelTpServer:
          if not self.disable_regex_jump_forward:
              # Check for jump-forward
              jump_forward_reqs = batch.check_for_jump_forward(self.model_runner)
-             self.forward_queue.extend(jump_forward_reqs)
+             self.waiting_queue.extend(jump_forward_reqs)
              if batch.is_empty():
                  return

@@ -583,11 +669,11 @@ class ModelTpServer:
              req.check_finished()

              if req.return_logprob:
-                 req.decode_token_logprobs.append(
+                 req.output_token_logprobs.append(
                      (next_token_logprobs[i], next_token_id)
                  )
                  if req.top_logprobs_num > 0:
-                     req.decode_top_logprobs.append(output.decode_top_logprobs[i])
+                     req.output_top_logprobs.append(output.output_top_logprobs[i])

          self.handle_finished_requests(batch)

@@ -639,16 +725,16 @@ class ModelTpServer:
                  }
                  if req.return_logprob:
                      (
-                         meta_info["prefill_token_logprobs"],
-                         meta_info["decode_token_logprobs"],
-                         meta_info["prefill_top_logprobs"],
-                         meta_info["decode_top_logprobs"],
+                         meta_info["input_token_logprobs"],
+                         meta_info["output_token_logprobs"],
+                         meta_info["input_top_logprobs"],
+                         meta_info["output_top_logprobs"],
                          meta_info["normalized_prompt_logprob"],
                      ) = (
-                         req.prefill_token_logprobs,
-                         req.decode_token_logprobs,
-                         req.prefill_top_logprobs,
-                         req.decode_top_logprobs,
+                         req.input_token_logprobs,
+                         req.output_token_logprobs,
+                         req.input_top_logprobs,
+                         req.output_top_logprobs,
                          req.normalized_prompt_logprob,
                      )
                  output_meta_info.append(meta_info)
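
These logprob field renames are visible to API clients: response metadata that previously used prefill_token_logprobs / decode_token_logprobs / prefill_top_logprobs / decode_top_logprobs now uses the input_/output_ names. A hedged client-side sketch (the key names come from this hunk; the /generate endpoint, payload shape, and a server on localhost:30000 are assumptions for illustration):

    import requests

    resp = requests.post(
        "http://localhost:30000/generate",
        json={
            "text": "The capital of France is",
            "sampling_params": {"max_new_tokens": 8},
            "return_logprob": True,
        },
    ).json()

    meta = resp["meta_info"]
    # 0.2.5 keys were prefill_token_logprobs / decode_token_logprobs, etc.
    print(meta["input_token_logprobs"])   # (logprob, token_id) pairs for the prompt
    print(meta["output_token_logprobs"])  # (logprob, token_id) pairs for the generation
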
@@ -690,8 +776,18 @@ class ModelTpServer:
          else:
              batch.reqs = []

+     def filter_out_inflight(self, batch: Batch):
+         # TODO(lsyin): reduce the overhead, make a special version for this
+         if self.current_inflight_req is None:
+             return
+
+         to_remove = batch.reqs.index(self.current_inflight_req)
+         unfinished_indices = [i for i in range(len(batch.reqs)) if i != to_remove]
+
+         batch.filter_batch(unfinished_indices)
+
      def flush_cache(self):
-         if len(self.forward_queue) == 0 and (
+         if len(self.waiting_queue) == 0 and (
              self.running_batch is None or len(self.running_batch.reqs) == 0
          ):
              self.tree_cache.reset()
@@ -704,20 +800,20 @@ class ModelTpServer:
          else:
              warnings.warn(
                  f"Cache not flushed because there are pending requests. "
-                 f"#queue-req: {len(self.forward_queue)}, "
+                 f"#queue-req: {len(self.waiting_queue)}, "
                  f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}"
              )

      def abort_request(self, recv_req):
          # Delete requests in the waiting queue
          to_del = None
-         for i, req in enumerate(self.forward_queue):
+         for i, req in enumerate(self.waiting_queue):
              if req.rid == recv_req.rid:
                  to_del = i
                  break

          if to_del is not None:
-             del self.forward_queue[to_del]
+             del self.waiting_queue[to_del]

          # Delete requests in the running batch
          if self.running_batch:
sglang/srt/mem_cache/flush_cache.py ADDED
@@ -0,0 +1,33 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ """
+ Flush the KV cache.
+
+ Usage:
+ python3 -m sglang.srt.mem_cache.flush_cache --url http://localhost:30000
+ """
+
+ import argparse
+
+ import requests
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--url", type=str, default="http://localhost:30000")
+     args = parser.parse_args()
+
+     response = requests.get(args.url + "/flush_cache")
+     assert response.status_code == 200
sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} RENAMED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Memory pool."""

  import logging
@@ -30,7 +45,7 @@ class ReqToTokenPool:

          return select_index

-     def free(self, free_index: int):
+     def free(self, free_index):
          self.mem_state[free_index] = True
          if isinstance(free_index, (int,)):
              self.can_use_mem_size += 1
sglang/srt/{managers/controller → mem_cache}/radix_cache.py RENAMED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """
  The radix tree data structure for managing the KV cache.
  """
sglang/srt/mm_utils.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  # Source: https://github.com/haotian-liu/LLaVA/blob/main/llava/mm_utils.py
  import ast
  import base64
sglang/srt/model_config.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  from typing import Optional

  from transformers import PretrainedConfig
@@ -36,6 +51,11 @@ class ModelConfig:
              "head_dim",
              self.hf_config.hidden_size // self.hf_config.num_attention_heads,
          )
+
+         # FIXME: temporary special judge for deepseek v2 MLA architecture
+         if "DeepseekV2ForCausalLM" in self.hf_config.architectures:
+             self.head_dim = 256
+
          self.num_attention_heads = self.hf_config.num_attention_heads
          self.num_key_value_heads = getattr(self.hf_config, "num_key_value_heads", None)
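
The model_config.py change above hard-codes head_dim = 256 whenever the checkpoint's architectures list contains DeepseekV2ForCausalLM, because DeepSeek-V2's MLA attention does not follow the usual hidden_size // num_attention_heads rule that the surrounding default assumes. A hedged sketch of the resulting selection order (the stand-in config values below are placeholders, not real checkpoint numbers):

    # Illustrative only: mirrors the head_dim selection with a stand-in config object.
    class FakeHFConfig:
        architectures = ["DeepseekV2ForCausalLM"]
        hidden_size = 5120
        num_attention_heads = 128
        # no explicit head_dim attribute, so the fallback division would apply

    hf_config = FakeHFConfig()
    head_dim = getattr(
        hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads
    )
    if "DeepseekV2ForCausalLM" in hf_config.architectures:
        head_dim = 256  # special-cased for the MLA KV layout, per the diff above
    print(head_dim)  # 256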