sglang 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +33 -26
- sglang/api.py +9 -1
- sglang/bench_latency.py +2 -2
- sglang/bench_serving.py +10 -1
- sglang/check_env.py +1 -1
- sglang/lang/backend/litellm.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/interpreter.py +21 -5
- sglang/lang/ir.py +1 -2
- sglang/srt/constrained/__init__.py +15 -0
- sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +17 -2
- sglang/srt/constrained/fsm_cache.py +17 -2
- sglang/srt/constrained/jump_forward.py +17 -2
- sglang/srt/conversation.py +26 -0
- sglang/srt/hf_transformers_utils.py +15 -0
- sglang/srt/layers/context_flashattention_nopad.py +15 -0
- sglang/srt/layers/extend_attention.py +15 -0
- sglang/srt/layers/fused_moe.py +15 -0
- sglang/srt/layers/linear.py +15 -0
- sglang/srt/layers/logits_processor.py +41 -13
- sglang/srt/layers/quantization/__init__.py +15 -0
- sglang/srt/layers/quantization/fp8.py +15 -0
- sglang/srt/layers/radix_attention.py +17 -2
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
- sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
- sglang/srt/managers/detokenizer_manager.py +16 -1
- sglang/srt/managers/io_struct.py +36 -3
- sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
- sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +60 -21
- sglang/srt/managers/tokenizer_manager.py +39 -16
- sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +159 -46
- sglang/srt/mem_cache/base_cache.py +43 -0
- sglang/srt/mem_cache/chunk_cache.py +60 -0
- sglang/srt/mem_cache/flush_cache.py +33 -0
- sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
- sglang/srt/{managers/controller → mem_cache}/radix_cache.py +20 -2
- sglang/srt/mm_utils.py +15 -0
- sglang/srt/model_config.py +15 -0
- sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
- sglang/srt/{managers/controller → model_executor}/model_runner.py +49 -14
- sglang/srt/model_loader/model_loader.py +15 -0
- sglang/srt/model_loader/utils.py +16 -1
- sglang/srt/models/chatglm.py +16 -1
- sglang/srt/models/commandr.py +16 -1
- sglang/srt/models/dbrx.py +16 -1
- sglang/srt/models/deepseek.py +16 -1
- sglang/srt/models/deepseek_v2.py +16 -1
- sglang/srt/models/gemma.py +16 -1
- sglang/srt/models/gemma2.py +16 -1
- sglang/srt/models/gpt_bigcode.py +16 -1
- sglang/srt/models/grok.py +16 -1
- sglang/srt/models/internlm2.py +16 -1
- sglang/srt/models/llama2.py +21 -22
- sglang/srt/models/llama_classification.py +16 -1
- sglang/srt/models/llava.py +17 -2
- sglang/srt/models/llavavid.py +17 -2
- sglang/srt/models/minicpm.py +16 -1
- sglang/srt/models/mistral.py +15 -0
- sglang/srt/models/mixtral.py +16 -1
- sglang/srt/models/mixtral_quant.py +16 -1
- sglang/srt/models/qwen.py +16 -1
- sglang/srt/models/qwen2.py +16 -1
- sglang/srt/models/qwen2_moe.py +16 -1
- sglang/srt/models/stablelm.py +16 -1
- sglang/srt/models/yivl.py +15 -0
- sglang/srt/openai_api/adapter.py +569 -131
- sglang/srt/openai_api/protocol.py +84 -2
- sglang/srt/sampling_params.py +15 -0
- sglang/srt/server.py +92 -23
- sglang/srt/server_args.py +52 -11
- sglang/srt/utils.py +15 -0
- sglang/test/test_programs.py +9 -6
- sglang/utils.py +22 -0
- sglang/version.py +1 -1
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA +33 -7
- sglang-0.2.8.dist-info/RECORD +95 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/WHEEL +1 -1
- sglang/srt/flush_cache.py +0 -18
- sglang-0.2.6.dist-info/RECORD +0 -93
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
sglang/srt/managers/{controller/tp_worker.py → tp_worker.py}
RENAMED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """A tensor parallel worker."""
 
 import logging
@@ -14,23 +29,24 @@ from sglang.global_config import global_config
 from sglang.srt.constrained.fsm_cache import FSMCache
 from sglang.srt.constrained.jump_forward import JumpForwardCache
 from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
-from sglang.srt.managers.controller.infer_batch import (
-    FINISH_ABORT,
-    BaseFinishReason,
-    Batch,
-    ForwardMode,
-    Req,
-)
-from sglang.srt.managers.controller.model_runner import ModelRunner
-from sglang.srt.managers.controller.radix_cache import RadixCache
-from sglang.srt.managers.controller.schedule_heuristic import ScheduleHeuristic
 from sglang.srt.managers.io_struct import (
     AbortReq,
     BatchTokenIDOut,
     FlushCacheReq,
     TokenizedGenerateReqInput,
 )
+from sglang.srt.managers.policy_scheduler import PolicyScheduler
+from sglang.srt.managers.schedule_batch import (
+    FINISH_ABORT,
+    BaseFinishReason,
+    Batch,
+    ForwardMode,
+    Req,
+)
+from sglang.srt.mem_cache.chunk_cache import ChunkCache
+from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.model_config import ModelConfig
+from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     get_int_token_logit_bias,
@@ -40,7 +56,7 @@ from sglang.srt.utils import (
 )
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger("srt.tp_worker")
+logger = logging.getLogger(__name__)
 
 
 class ModelTpServer:
@@ -59,9 +75,13 @@ class ModelTpServer:
         self.tp_rank = tp_rank
         self.tp_size = server_args.tp_size
         self.dp_size = server_args.dp_size
-        self.schedule_heuristic = server_args.schedule_heuristic
+        self.schedule_policy = server_args.schedule_policy
         self.disable_regex_jump_forward = server_args.disable_regex_jump_forward
 
+        # Chunked prefill
+        self.chunked_prefill_size = server_args.chunked_prefill_size
+        self.current_inflight_req = None
+
         # Init model and tokenizer
         self.model_config = ModelConfig(
             server_args.model_path,
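Note: the two fields added above are the entire state needed for chunked prefill: a per-batch prefill budget and at most one partially prefilled ("inflight") request. A minimal sketch of the resulting chunking arithmetic, using a hypothetical chunk_lengths helper that is not part of the diff:

def chunk_lengths(prompt_len: int, prefix_len: int, chunk_size: int):
    # Extend lengths the worker would process on successive scheduling steps.
    remaining = prompt_len - prefix_len
    while remaining > 0:
        step = min(remaining, chunk_size)
        yield step
        remaining -= step

# A 10000-token prompt under an 8192-token budget prefills in two steps.
assert list(chunk_lengths(10000, 0, 8192)) == [8192, 1808]

Only the final chunk ends with sampling; intermediate chunks merely extend the KV cache (see the forward_prefill_batch hunk below).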
@@ -117,7 +137,7 @@ class ModelTpServer:
 
         # Print info
         logger.info(
-            f"[
+            f"[gpu={self.gpu_id}] "
             f"max_total_num_tokens={self.max_total_num_tokens}, "
             f"max_prefill_tokens={self.max_prefill_tokens}, "
             f"max_running_requests={self.max_running_requests}, "
@@ -125,14 +145,23 @@ class ModelTpServer:
         )
 
         # Init cache
-        self.tree_cache = RadixCache(
-            req_to_token_pool=self.model_runner.req_to_token_pool,
-            token_to_kv_pool=self.model_runner.token_to_kv_pool,
-            disable=server_args.disable_radix_cache,
-        )
+        if (
+            server_args.chunked_prefill_size is not None
+            and server_args.disable_radix_cache
+        ):
+            self.tree_cache = ChunkCache(
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            )
+        else:
+            self.tree_cache = RadixCache(
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+                disable=server_args.disable_radix_cache,
+            )
         self.tree_cache_metrics = {"total": 0, "hit": 0}
-        self.scheduler = ScheduleHeuristic(
-            self.schedule_heuristic,
+        self.scheduler = PolicyScheduler(
+            self.schedule_policy,
             self.max_running_requests,
             self.max_prefill_tokens,
             self.max_total_num_tokens,
@@ -142,7 +171,7 @@ class ModelTpServer:
         self.token_to_kv_pool = self.model_runner.token_to_kv_pool
 
         # Init running status
-        self.forward_queue: List[Req] = []
+        self.waiting_queue: List[Req] = []
         self.running_batch: Batch = None
         self.out_pyobjs = []
         self.decode_forward_ct = 0
@@ -205,6 +234,7 @@ class ModelTpServer:
             # Run a new prefill batch
             self.forward_prefill_batch(new_batch)
             self.cache_filled_batch(new_batch)
+            self.filter_out_inflight(new_batch)
 
             if not new_batch.is_empty():
                 if self.running_batch is None:
@@ -241,12 +271,12 @@ class ModelTpServer:
             self.num_generated_tokens = 0
             self.last_stats_tic = time.time()
             logger.info(
-                f"[
+                f"[gpu={self.gpu_id}] Decode batch. "
                 f"#running-req: {len(self.running_batch.reqs)}, "
                 f"#token: {num_used}, "
                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                 f"gen throughput (token/s): {throughput:.2f}, "
-                f"#queue-req: {len(self.forward_queue)}"
+                f"#queue-req: {len(self.waiting_queue)}"
             )
 
     def check_memory(self):
@@ -260,6 +290,14 @@ class ModelTpServer:
                 "KV cache pool leak detected!"
             )
 
+        if self.req_to_token_pool.can_use_mem_size != self.req_to_token_pool.size:
+            warnings.warn(
+                "Warning: "
+                f"available req slots={self.req_to_token_pool.can_use_mem_size}, "
+                f"total slots={self.req_to_token_pool.size}\n"
+                "Memory pool leak detected!"
+            )
+
     def handle_generate_request(
         self,
         recv_req: TokenizedGenerateReqInput,
@@ -313,9 +351,10 @@ class ModelTpServer:
             ),
             self.max_req_input_len - 1 - len(req.origin_input_ids),
         )
-        self.forward_queue.append(req)
+        self.waiting_queue.append(req)
 
     def get_new_prefill_batch(self) -> Optional[Batch]:
+        # TODO(lsyin): organize this function
         running_bs = (
             len(self.running_batch.reqs) if self.running_batch is not None else 0
         )
@@ -323,9 +362,12 @@ class ModelTpServer:
            return
 
         # Compute matched prefix length
-        for req in self.forward_queue:
+        for req in self.waiting_queue:
             req.input_ids = req.origin_input_ids + req.output_ids
-            prefix_indices, last_node = self.tree_cache.match_prefix(req.input_ids)
+            prefix_indices, last_node = self.tree_cache.match_prefix(
+                rid=req.rid,
+                key=req.input_ids,
+            )
             if req.return_logprob:
                 prefix_indices = prefix_indices[: req.logprob_start_len]
             req.extend_input_len = len(req.input_ids) - len(prefix_indices)
@@ -333,7 +375,7 @@ class ModelTpServer:
             req.last_node = last_node
 
         # Get priority queue
-        self.forward_queue = self.scheduler.get_priority_queue(self.forward_queue)
+        self.waiting_queue = self.scheduler.get_priority_queue(self.waiting_queue)
 
         # Add requests if there is available space
         can_run_list = []
@@ -352,7 +394,33 @@ class ModelTpServer:
                 ]
             )
 
-        for req in self.forward_queue:
+        # Handle the current inflight request
+        take_inflight = 0
+        if self.current_inflight_req:
+            take_inflight = 1
+            r = self.current_inflight_req
+            r.input_ids = r.origin_input_ids + r.output_ids
+            truncated = (
+                len(r.input_ids) - len(r.prefix_indices) > self.chunked_prefill_size
+            )
+            r.extend_input_len = min(
+                len(r.input_ids) - len(r.prefix_indices), self.chunked_prefill_size
+            )
+            r.input_ids = r.input_ids[: len(r.prefix_indices) + r.extend_input_len]
+            can_run_list.append(r)
+
+            if not truncated:
+                # Finish inflight
+                self.current_inflight_req = None
+                new_batch_total_tokens += (
+                    r.extend_input_len + r.sampling_params.max_new_tokens
+                )
+                new_batch_input_tokens += r.extend_input_len
+            else:
+                new_batch_total_tokens += r.extend_input_len
+                new_batch_input_tokens += r.extend_input_len
+
+        for req in self.waiting_queue:
             if req.return_logprob and req.normalized_prompt_logprob is None:
                 # Need at least two tokens to compute normalized logprob
                 if req.extend_input_len < 2:
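Note: the inflight block above finishes or re-truncates the carried-over request before any queued request is considered, and its token accounting is asymmetric: only the final chunk reserves the request's decode budget. A hedged sketch of that rule (names are illustrative, not from the source):

def tokens_reserved(extend_input_len: int, max_new_tokens: int, is_last_chunk: bool) -> int:
    # Intermediate chunks occupy KV slots only for the tokens they prefill;
    # the last chunk must also reserve room for the tokens to be decoded.
    return extend_input_len + (max_new_tokens if is_last_chunk else 0)

assert tokens_reserved(512, 128, is_last_chunk=False) == 512
assert tokens_reserved(512, 128, is_last_chunk=True) == 640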
@@ -394,11 +462,39 @@ class ModelTpServer:
                     break
                 else:
                     # Add this request to the running batch
-                    can_run_list.append(req)
-                    new_batch_total_tokens += (
-                        req.extend_input_len + req.sampling_params.max_new_tokens
-                    )
-                    new_batch_input_tokens += req.extend_input_len
+                    if (
+                        self.chunked_prefill_size is None
+                        or (
+                            new_batch_input_tokens + req.extend_input_len
+                            <= self.chunked_prefill_size
+                        )
+                        or (
+                            req.return_logprob and req.normalized_prompt_logprob is None
+                        )
+                    ):
+                        can_run_list.append(req)
+                        new_batch_total_tokens += (
+                            req.extend_input_len + req.sampling_params.max_new_tokens
+                        )
+                        new_batch_input_tokens += req.extend_input_len
+                    else:
+                        trunc_len = self.chunked_prefill_size - new_batch_input_tokens
+
+                        if trunc_len <= 0:
+                            # Undo locking
+                            delta = self.tree_cache.dec_lock_ref(req.last_node)
+                            available_size += delta
+                            break
+
+                        req.extend_input_len = trunc_len
+                        req.input_ids = req.input_ids[
+                            : len(req.prefix_indices) + req.extend_input_len
+                        ]
+                        can_run_list.append(req)
+                        self.current_inflight_req = req
+                        new_batch_input_tokens += req.extend_input_len
+                        new_batch_total_tokens += req.extend_input_len
+                        break
             else:
                 break
 
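Note: the admission branch above has three outcomes per queued request: admit it whole, truncate it (it then becomes the new inflight request and admission stops), or stop immediately when the budget is already spent. A standalone sketch of the same decision, with assumed names and without the logprob carve-out that the real condition also allows through:

def admit(extend_input_len, batch_input_tokens, chunked_prefill_size):
    # Returns (tokens_to_prefill_now, becomes_inflight), or None to stop admitting.
    if (
        chunked_prefill_size is None
        or batch_input_tokens + extend_input_len <= chunked_prefill_size
    ):
        return extend_input_len, False
    trunc_len = chunked_prefill_size - batch_input_tokens
    if trunc_len <= 0:
        return None
    return trunc_len, True

assert admit(300, 0, 1024) == (300, False)    # fits: full prefill
assert admit(2000, 512, 1024) == (512, True)  # truncated: becomes inflight
assert admit(10, 1024, 1024) is None          # budget spent: stop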
@@ -419,13 +515,13 @@ class ModelTpServer:
                 self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
             )
             logger.info(
-                f"[
+                f"[gpu={self.gpu_id}] Prefill batch. "
                 f"#new-seq: {len(can_run_list)}, "
                 f"#new-token: {new_batch_input_tokens}, "
                 f"#cached-token: {hit_tokens}, "
                 f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
                 f"#running-req: {running_bs}, "
-                f"#queue-req: {len(self.forward_queue) - len(can_run_list)}"
+                f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + take_inflight}"
             )
 
         # Return the new batch
@@ -435,7 +531,7 @@ class ModelTpServer:
             self.token_to_kv_pool,
             self.tree_cache,
         )
-        self.forward_queue = [x for x in self.forward_queue if x not in can_run_list]
+        self.waiting_queue = [x for x in self.waiting_queue if x not in can_run_list]
         return new_batch
 
     def forward_prefill_batch(self, batch: Batch):
@@ -467,9 +563,10 @@ class ModelTpServer:
         # Check finish conditions
         pt = 0
         for i, req in enumerate(batch.reqs):
-            req.completion_tokens_wo_jump_forward += 1
-            req.output_ids.append(next_token_ids[i])
-            req.check_finished()
+            if req is not self.current_inflight_req:
+                req.completion_tokens_wo_jump_forward += 1
+                req.output_ids.append(next_token_ids[i])
+                req.check_finished()
 
             if req.return_logprob:
                 self.add_logprob_return_values(i, req, pt, next_token_ids, output)
@@ -530,7 +627,8 @@ class ModelTpServer:
         req_pool_indices_cpu = batch.req_pool_indices.cpu().numpy()
         for i, req in enumerate(batch.reqs):
             new_prefix_indices, new_last_node = self.tree_cache.cache_req(
-                token_ids=tuple(req.input_ids),
+                rid=req.rid,
+                token_ids=tuple(req.input_ids),
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
                 del_in_memory_pool=False,
@@ -538,6 +636,10 @@ class ModelTpServer:
             )
             req.prefix_indices, req.last_node = new_prefix_indices, new_last_node
 
+            if req is self.current_inflight_req:
+                # inflight request would get a new req idx
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[i]))
+
     def forward_decode_batch(self, batch: Batch):
         # Check if decode out of memory
         if not batch.check_decode_mem():
@@ -551,7 +653,7 @@ class ModelTpServer:
                 f"#retracted_reqs: {len(retracted_reqs)}, "
                 f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
             )
-            self.forward_queue.extend(retracted_reqs)
+            self.waiting_queue.extend(retracted_reqs)
         else:
             self.new_token_ratio = max(
                 self.new_token_ratio - self.new_token_ratio_decay,
@@ -561,7 +663,7 @@ class ModelTpServer:
         if not self.disable_regex_jump_forward:
             # Check for jump-forward
             jump_forward_reqs = batch.check_for_jump_forward(self.model_runner)
-            self.forward_queue.extend(jump_forward_reqs)
+            self.waiting_queue.extend(jump_forward_reqs)
             if batch.is_empty():
                 return
 
@@ -683,6 +785,7 @@ class ModelTpServer:
         for i in finished_indices:
             req = batch.reqs[i]
             self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=tuple(req.origin_input_ids + req.output_ids)[:-1],
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
@@ -696,8 +799,18 @@ class ModelTpServer:
         else:
             batch.reqs = []
 
+    def filter_out_inflight(self, batch: Batch):
+        # TODO(lsyin): reduce the overhead, make a special version for this
+        if self.current_inflight_req is None:
+            return
+
+        to_remove = batch.reqs.index(self.current_inflight_req)
+        unfinished_indices = [i for i in range(len(batch.reqs)) if i != to_remove]
+
+        batch.filter_batch(unfinished_indices)
+
     def flush_cache(self):
-        if len(self.forward_queue) == 0 and (
+        if len(self.waiting_queue) == 0 and (
             self.running_batch is None or len(self.running_batch.reqs) == 0
         ):
             self.tree_cache.reset()
@@ -710,20 +823,20 @@ class ModelTpServer:
         else:
             warnings.warn(
                 f"Cache not flushed because there are pending requests. "
-                f"#queue-req: {len(self.forward_queue)}, "
+                f"#queue-req: {len(self.waiting_queue)}, "
                 f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}"
             )
 
     def abort_request(self, recv_req):
         # Delete requests in the waiting queue
         to_del = None
-        for i, req in enumerate(self.forward_queue):
+        for i, req in enumerate(self.waiting_queue):
             if req.rid == recv_req.rid:
                 to_del = i
                 break
 
         if to_del is not None:
-            del self.forward_queue[to_del]
+            del self.waiting_queue[to_del]
 
         # Delete requests in the running batch
         if self.running_batch:
sglang/srt/mem_cache/base_cache.py
ADDED
@@ -0,0 +1,43 @@
+from abc import ABC, abstractmethod
+
+
+class BasePrefixCache(ABC):
+    """Cache can be indexed by either rid or key."""
+
+    @abstractmethod
+    def reset(self):
+        pass
+
+    @abstractmethod
+    def match_prefix(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def insert(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def cache_req(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def evict(self, num_tokens, evict_callback):
+        pass
+
+    @abstractmethod
+    def inc_lock_ref(self, node):
+        pass
+
+    @abstractmethod
+    def dec_lock_ref(self, node):
+        pass
+
+    @abstractmethod
+    def evictable_size(self):
+        pass
+
+    def total_size(self):
+        raise NotImplementedError
+
+    def pretty_print(self):
+        raise NotImplementedError
sglang/srt/mem_cache/chunk_cache.py
ADDED
@@ -0,0 +1,60 @@
+"""Cache for chunked prefill, used when RadixCache is disabled."""
+
+from sglang.srt.mem_cache.base_cache import BasePrefixCache
+
+
+class ChunkCacheEntry:
+    def __init__(self, rid, value):
+        self.rid = rid
+        self.value = value
+
+
+class ChunkCache(BasePrefixCache):
+    def __init__(self, req_to_token_pool, token_to_kv_pool):
+        self.disable = True
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool = token_to_kv_pool
+
+        self.reset()
+
+    def reset(self):
+        self.entries = {}
+
+    def match_prefix(self, rid, **kwargs):
+        if rid not in self.entries:
+            return [], None
+
+        entry = self.entries[rid]
+        return entry.value, entry
+
+    def cache_req(
+        self, rid, token_ids, req_pool_idx, del_in_memory_pool=True, **kwargs
+    ):
+        indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
+        if del_in_memory_pool:
+            assert rid in self.entries
+            self.req_to_token_pool.free(req_pool_idx)
+            self.token_to_kv_pool.free(indices)
+            return
+
+        if rid not in self.entries:
+            self.entries[rid] = ChunkCacheEntry(rid, indices)
+
+        entry = self.entries[rid]
+        entry.value = indices
+        return indices, entry
+
+    def insert(self):
+        raise NotImplementedError
+
+    def evict(self, num_tokens, evict_callback):
+        pass
+
+    def inc_lock_ref(self, node):
+        return 0
+
+    def dec_lock_ref(self, node):
+        return 0
+
+    def evictable_size(self):
+        return 0
sglang/srt/mem_cache/flush_cache.py
ADDED
@@ -0,0 +1,33 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+Flush the KV cache.
+
+Usage:
+python3 -m sglang.srt.mem_cache.flush_cache --url http://localhost:30000
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", type=str, default="http://localhost:30000")
+    args = parser.parse_args()
+
+    response = requests.get(args.url + "/flush_cache")
+    assert response.status_code == 200
sglang/srt/{memory_pool.py → mem_cache/memory_pool.py}
RENAMED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Memory pool."""
 
 import logging
@@ -30,7 +45,7 @@ class ReqToTokenPool:
 
         return select_index
 
-    def free(self, free_index: int):
+    def free(self, free_index):
         self.mem_state[free_index] = True
         if isinstance(free_index, (int,)):
             self.can_use_mem_size += 1
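Note: the dropped annotation matches what the body already does: free() branches on the argument type, so it can take one slot index (as in the inflight path of cache_filled_batch above and in ChunkCache.cache_req) or a batch of indices. The array branch is not shown in this diff; the sketch below fills it in as an assumption:

def free(self, free_index):
    self.mem_state[free_index] = True  # works for a scalar or an index array
    if isinstance(free_index, (int,)):
        self.can_use_mem_size += 1
    else:
        # assumed: free_index is an array-like of request slot indices
        self.can_use_mem_size += len(free_index)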
sglang/srt/{managers/controller → mem_cache}/radix_cache.py
RENAMED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 The radix tree data structure for managing the KV cache.
 """
@@ -8,6 +23,8 @@ from collections import defaultdict
 
 import torch
 
+from sglang.srt.mem_cache.base_cache import BasePrefixCache
+
 
 class TreeNode:
     def __init__(self):
@@ -31,7 +48,7 @@ def _key_match(key0, key1):
     return i
 
 
-class RadixCache:
+class RadixCache(BasePrefixCache):
     def __init__(self, req_to_token_pool, token_to_kv_pool, disable: bool = False):
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool = token_to_kv_pool
@@ -47,7 +64,7 @@ class RadixCache(BasePrefixCache):
         self.root_node.lock_ref = 1
         self.evictable_size_ = 0
 
-    def match_prefix(self, key):
+    def match_prefix(self, key, **kwargs):
         if self.disable:
             return [], self.root_node
 
@@ -75,6 +92,7 @@ class RadixCache(BasePrefixCache):
         req_pool_idx,
         del_in_memory_pool=True,
         old_last_node=None,
+        **kwargs,
     ):
         # Insert the request into radix cache
         indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
sglang/srt/mm_utils.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Source: https://github.com/haotian-liu/LLaVA/blob/main/llava/mm_utils.py
 import ast
 import base64
sglang/srt/model_config.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 from typing import Optional
 
 from transformers import PretrainedConfig
sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py
RENAMED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Run the model with cuda graph."""
 
 import bisect
@@ -14,7 +29,7 @@ from sglang.srt.layers.logits_processor import (
     LogitsMetadata,
     LogitsProcessor,
 )
-from sglang.srt.managers.controller.infer_batch import (
+from sglang.srt.managers.schedule_batch import (
     Batch,
     ForwardMode,
     InputMetadata,
|