sglang 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +13 -1
- sglang/bench_latency.py +10 -5
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/global_config.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +60 -49
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +5 -2
- sglang/lang/ir.py +22 -4
- sglang/launch_server.py +8 -1
- sglang/srt/constrained/jump_forward.py +13 -2
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +24 -2
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/__init__.py +1 -0
- sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
- sglang/srt/layers/fused_moe/layer.py +587 -0
- sglang/srt/layers/layernorm.py +3 -0
- sglang/srt/layers/logits_processor.py +64 -27
- sglang/srt/layers/radix_attention.py +41 -18
- sglang/srt/layers/sampler.py +154 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +59 -179
- sglang/srt/managers/tokenizer_manager.py +193 -84
- sglang/srt/managers/tp_worker.py +131 -50
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +97 -28
- sglang/srt/model_executor/forward_batch_info.py +188 -82
- sglang/srt/model_executor/model_runner.py +269 -87
- sglang/srt/models/chatglm.py +6 -14
- sglang/srt/models/commandr.py +6 -2
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +7 -3
- sglang/srt/models/deepseek_v2.py +12 -7
- sglang/srt/models/gemma.py +6 -2
- sglang/srt/models/gemma2.py +22 -8
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +66 -398
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +7 -3
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/minicpm.py +7 -3
- sglang/srt/models/mixtral.py +61 -255
- sglang/srt/models/mixtral_quant.py +6 -5
- sglang/srt/models/qwen.py +7 -4
- sglang/srt/models/qwen2.py +15 -5
- sglang/srt/models/qwen2_moe.py +7 -16
- sglang/srt/models/stablelm.py +6 -2
- sglang/srt/openai_api/adapter.py +149 -58
- sglang/srt/sampling/sampling_batch_info.py +209 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -4
- sglang/srt/server.py +107 -71
- sglang/srt/server_args.py +49 -15
- sglang/srt/utils.py +27 -18
- sglang/test/runners.py +38 -38
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_programs.py +32 -5
- sglang/test/test_utils.py +37 -50
- sglang/version.py +1 -1
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA +102 -27
- sglang-0.2.14.dist-info/RECORD +114 -0
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang/srt/model_loader/model_loader.py +0 -292
- sglang/srt/model_loader/utils.py +0 -275
- sglang-0.2.12.dist-info/RECORD +0 -112
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tp_worker.py
CHANGED
```diff
@@ -31,7 +31,7 @@ from sglang.global_config import global_config
 from sglang.srt.constrained.fsm_cache import FSMCache
 from sglang.srt.constrained.jump_forward import JumpForwardCache
 from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
-from sglang.srt.layers.logits_processor import
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.io_struct import (
     AbortReq,
     BatchEmbeddingOut,
@@ -39,6 +39,8 @@ from sglang.srt.managers.io_struct import (
     FlushCacheReq,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
+    UpdateWeightReqInput,
+    UpdateWeightReqOutput,
 )
 from sglang.srt.managers.policy_scheduler import PolicyScheduler, PrefillAdder
 from sglang.srt.managers.schedule_batch import (
@@ -54,7 +56,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
-
+    configure_logger,
     is_multimodal_model,
     set_random_seed,
     suppress_other_loggers,
@@ -86,10 +88,6 @@ class ModelTpServer:
         self.schedule_policy = server_args.schedule_policy
         self.disable_regex_jump_forward = server_args.disable_regex_jump_forward

-        # Chunked prefill
-        self.chunked_prefill_size = server_args.chunked_prefill_size
-        self.current_inflight_req = None
-
         # Init model and tokenizer
         self.model_config = ModelConfig(
             server_args.model_path,
@@ -97,6 +95,7 @@ class ModelTpServer:
             context_length=server_args.context_length,
             model_overide_args=model_overide_args,
         )
+
         self.model_runner = ModelRunner(
             model_config=self.model_config,
             mem_fraction_static=server_args.mem_fraction_static,
@@ -132,18 +131,21 @@ class ModelTpServer:
             ),
             self.model_runner.req_to_token_pool.size - 1,
         )
-        self.int_token_logit_bias = torch.tensor(
-            get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size)
-        )
         self.max_req_input_len = min(
             self.model_config.context_len - 1,
             self.max_total_num_tokens - 1,
         )
+
+        # Sync random seed
+        server_args.random_seed = broadcast_recv_input(
+            [server_args.random_seed],
+            self.tp_rank,
+            self.model_runner.tp_group.cpu_group,
+        )[0]
         set_random_seed(server_args.random_seed)

         # Print info
         logger.info(
-            f"[gpu={self.gpu_id}] "
             f"max_total_num_tokens={self.max_total_num_tokens}, "
             f"max_prefill_tokens={self.max_prefill_tokens}, "
             f"max_running_requests={self.max_running_requests}, "
```
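The hunk above drops the old `int_token_logit_bias` setup and instead synchronizes the random seed across tensor-parallel ranks: rank 0's `server_args.random_seed` is broadcast through `broadcast_recv_input` (defined near the end of this file) before `set_random_seed` is called, so every rank samples identically. Below is a minimal sketch of that broadcast pattern with `torch.distributed`; `broadcast_pyobj` is an illustrative name rather than sglang's API, and it assumes a CPU (gloo) process group is already initialized, matching the `tp_group.cpu_group` argument in the diff.

```python
# Hedged sketch of broadcasting a Python object (e.g. a random seed) from
# rank 0 to all ranks, in the spirit of broadcast_recv_input above.
# Assumes dist.init_process_group("gloo", ...) has already been called.
import pickle

import torch
import torch.distributed as dist


def broadcast_pyobj(data, rank: int, dist_group):
    """Rank 0 sends `data`; every rank returns the same object."""
    if rank == 0:
        payload = torch.frombuffer(bytearray(pickle.dumps(data)), dtype=torch.uint8)
        size = torch.tensor([payload.numel()], dtype=torch.long)
        dist.broadcast(size, src=0, group=dist_group)
        dist.broadcast(payload, src=0, group=dist_group)
        return data
    else:
        size = torch.tensor([0], dtype=torch.long)
        dist.broadcast(size, src=0, group=dist_group)
        payload = torch.empty(int(size.item()), dtype=torch.uint8)
        dist.broadcast(payload, src=0, group=dist_group)
        return pickle.loads(payload.numpy().tobytes())
```

Serializing with pickle and sending the length first lets an arbitrary Python object ride on two plain tensor broadcasts, which is why a single int seed can be shared without any extra protocol.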
```diff
@@ -179,6 +181,13 @@ class ModelTpServer:
         self.num_generated_tokens = 0
         self.last_stats_tic = time.time()

+        # Chunked prefill
+        self.chunked_prefill_size = server_args.chunked_prefill_size
+        self.current_inflight_req = None
+        self.is_mixed_chunk = (
+            self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
+        )
+
         # Init the FSM cache for constrained generation
         if not server_args.skip_tokenizer_init:
             self.regex_fsm_cache = FSMCache(
@@ -215,6 +224,9 @@ class ModelTpServer:
                 self.flush_cache()
             elif isinstance(recv_req, AbortReq):
                 self.abort_request(recv_req)
+            elif isinstance(recv_req, UpdateWeightReqInput):
+                success, message = self.update_weights(recv_req)
+                self.out_pyobjs.append(UpdateWeightReqOutput(success, message))
             else:
                 raise ValueError(f"Invalid request: {recv_req}")

@@ -272,7 +284,7 @@ class ModelTpServer:
                 self.num_generated_tokens = 0
                 self.last_stats_tic = time.time()
                 logger.info(
-                    f"
+                    f"Decode batch. "
                     f"#running-req: {len(self.running_batch.reqs)}, "
                     f"#token: {num_used}, "
                     f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
@@ -311,11 +323,16 @@ class ModelTpServer:
         if self.model_runner.is_generation:
             req.pixel_values = recv_req.pixel_values
             if req.pixel_values is not None:
+                image_hash = (
+                    hash(tuple(recv_req.image_hash))
+                    if isinstance(recv_req.image_hash, list)
+                    else recv_req.image_hash
+                )
                 req.pad_value = [
-                    (
-                    (
-                    (
-                    (
+                    (image_hash) % self.model_config.vocab_size,
+                    (image_hash >> 16) % self.model_config.vocab_size,
+                    (image_hash >> 32) % self.model_config.vocab_size,
+                    (image_hash >> 64) % self.model_config.vocab_size,
                 ]
                 req.image_size = recv_req.image_size
                 (
@@ -370,11 +387,14 @@ class ModelTpServer:
         # Get priority queue
         prefix_computed = self.scheduler.calc_priority(self.waiting_queue)

+        num_mixed_running = running_bs if self.is_mixed_chunk else 0
+
         adder = PrefillAdder(
             self.tree_cache,
             self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size(),
             self.max_prefill_tokens,
             self.chunked_prefill_size,
+            num_mixed_running,
         )

         if self.running_batch is not None:
@@ -420,15 +440,27 @@ class ModelTpServer:
             )
         else:
             tree_cache_hit_rate = 0.0
-
-
-
-
-
-
-
-
-
+
+        if num_mixed_running > 0:
+            logger.info(
+                f"Prefill batch"
+                f"(mixed #running-req: {num_mixed_running}). "
+                f"#new-seq: {len(can_run_list)}, "
+                f"#new-token: {adder.log_input_tokens}, "
+                f"#cached-token: {adder.log_hit_tokens}, "
+                f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
+            )
+        else:
+            logger.info(
+                f"Prefill batch. "
+                f"#new-seq: {len(can_run_list)}, "
+                f"#new-token: {adder.log_input_tokens}, "
+                f"#cached-token: {adder.log_hit_tokens}, "
+                f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                f"#running-req: {running_bs}, "
+                f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
+            )

         # Return the new batch
         new_batch = ScheduleBatch.init_new(
@@ -442,25 +474,41 @@ class ModelTpServer:

     def forward_prefill_batch(self, batch: ScheduleBatch):
         # Build batch tensors
-        batch.prepare_for_extend(
-
-
+        batch.prepare_for_extend(self.model_config.vocab_size)
+
+        decoding_reqs = []
+        if self.is_mixed_chunk and self.running_batch is not None:
+            self.running_batch.prepare_for_decode()
+            batch.mix_with_running(self.running_batch)
+            decoding_reqs = self.running_batch.reqs
+            self.running_batch = None

         if self.model_runner.is_generation:
             # Forward and sample the next tokens
             if batch.extend_num_tokens != 0:
-
-
+                sample_output, logits_output = self.model_runner.forward(
+                    batch, ForwardMode.EXTEND
+                )
+                next_token_ids = batch.check_sample_results(sample_output)
+                batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens(
+                    next_token_ids
+                )

                 # Move logprobs to cpu
-                if
-
-
-
-
-
-
-
+                if logits_output.next_token_logprobs is not None:
+                    logits_output.next_token_logprobs = (
+                        logits_output.next_token_logprobs[
+                            torch.arange(
+                                len(next_token_ids), device=next_token_ids.device
+                            ),
+                            next_token_ids,
+                        ].tolist()
+                    )
+                    logits_output.input_token_logprobs = (
+                        logits_output.input_token_logprobs.tolist()
+                    )
+                    logits_output.normalized_prompt_logprobs = (
+                        logits_output.normalized_prompt_logprobs.tolist()
                     )

                 next_token_ids = next_token_ids.tolist()
@@ -483,9 +531,15 @@ class ModelTpServer:
                 req.output_ids.append(next_token_ids[i])
                 req.check_finished()

+                if req.regex_fsm is not None:
+                    req.regex_fsm_state = req.regex_fsm.get_next_state(
+                        req.regex_fsm_state, next_token_ids[i]
+                    )
+
                 if req.finished():
                     self.tree_cache.cache_finished_req(req)
-
+                elif req not in decoding_reqs:
+                    # To reduce overhead, only cache prefill reqs
                     self.tree_cache.cache_unfinished_req(req)

                 if req is self.current_inflight_req:
@@ -493,12 +547,14 @@ class ModelTpServer:
                     self.req_to_token_pool.free(req.req_pool_idx)

                 if req.return_logprob:
-                    self.add_logprob_return_values(
+                    self.add_logprob_return_values(
+                        i, req, pt, next_token_ids, logits_output
+                    )
                     pt += req.extend_input_len
         else:
             assert batch.extend_num_tokens != 0
-
-            embeddings =
+            logits_output = self.model_runner.forward(batch, ForwardMode.EXTEND)
+            embeddings = logits_output.embeddings.tolist()

             # Check finish conditions
             for i, req in enumerate(batch.reqs):
@@ -526,7 +582,7 @@ class ModelTpServer:
         req: Req,
         pt: int,
         next_token_ids: List[int],
-        output:
+        output: LogitsProcessorOutput,
     ):
         if req.normalized_prompt_logprob is None:
             req.normalized_prompt_logprob = output.normalized_prompt_logprobs[i]
@@ -585,7 +641,7 @@ class ModelTpServer:
             self.new_token_ratio = new_token_ratio

             logger.info(
-                "
+                "Decode out of memory happened. "
                 f"#retracted_reqs: {len(retracted_reqs)}, "
                 f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
             )
@@ -608,12 +664,17 @@ class ModelTpServer:
         batch.prepare_for_decode()

         # Forward and sample the next tokens
-
-
+        sample_output, logits_output = self.model_runner.forward(
+            batch, ForwardMode.DECODE
+        )
+        next_token_ids = batch.check_sample_results(sample_output)
+        batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens(
+            next_token_ids
+        )

         # Move logprobs to cpu
-        if
-            next_token_logprobs =
+        if logits_output.next_token_logprobs is not None:
+            next_token_logprobs = logits_output.next_token_logprobs[
                 torch.arange(len(next_token_ids), device=next_token_ids.device),
                 next_token_ids,
             ].tolist()
@@ -626,6 +687,11 @@ class ModelTpServer:
             req.output_ids.append(next_token_id)
             req.check_finished()

+            if req.regex_fsm is not None:
+                req.regex_fsm_state = req.regex_fsm.get_next_state(
+                    req.regex_fsm_state, next_token_id
+                )
+
             if req.finished():
                 self.tree_cache.cache_finished_req(req)

@@ -634,7 +700,7 @@ class ModelTpServer:
                     (next_token_logprobs[i], next_token_id)
                 )
                 if req.top_logprobs_num > 0:
-                    req.output_top_logprobs.append(
+                    req.output_top_logprobs.append(logits_output.output_top_logprobs[i])

         self.handle_finished_requests(batch)

@@ -749,12 +815,15 @@ class ModelTpServer:
             self.token_to_kv_pool.clear()
             torch.cuda.empty_cache()
             logger.info("Cache flushed successfully!")
+            if_success = True
         else:
-
+            logging.warning(
                 f"Cache not flushed because there are pending requests. "
                 f"#queue-req: {len(self.waiting_queue)}, "
                 f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}"
             )
+            if_success = False
+        return if_success

     def abort_request(self, recv_req):
         # Delete requests in the waiting queue
@@ -774,6 +843,15 @@ class ModelTpServer:
                 req.finished_reason = FINISH_ABORT()
                 break

+    def update_weights(self, recv_req):
+        success, message = self.model_runner.update_weights(
+            recv_req.model_path, recv_req.load_format
+        )
+        if success:
+            flash_cache_success = self.flush_cache()
+            assert flash_cache_success, "Cache flush failed after updating weights"
+        return success, message
+

 def run_tp_server(
     gpu_id: int,
@@ -782,7 +860,9 @@ def run_tp_server(
     nccl_port: int,
     model_overide_args: dict,
 ):
-    """Run a tensor parallel server."""
+    """Run a tensor parallel model server."""
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
+
     try:
         model_server = ModelTpServer(
             gpu_id,
@@ -838,6 +918,7 @@ def broadcast_recv_input(

         dist.broadcast(tensor_size, src=0, group=dist_group)
         dist.broadcast(tensor_data, src=0, group=dist_group)
+        return data
     else:
         tensor_size = torch.tensor([0], dtype=torch.long)
         dist.broadcast(tensor_size, src=0, group=dist_group)
```
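In the prefill path above, the image hash is folded into four pad token ids: the hash is shifted by 0, 16, 32, and 64 bits and each window is reduced modulo the vocabulary size, so padded image regions get a token pattern that is stable for the same image and unlikely to collide between different images. A tiny self-contained illustration of that arithmetic, with made-up values:

```python
# Illustration of the pad_value computation above; vocab_size and the
# hashed object are made-up examples.
vocab_size = 32000
image_hash = hash(("example-image-bytes",))  # any Python int

pad_value = [
    (image_hash) % vocab_size,
    (image_hash >> 16) % vocab_size,   # a different 16-bit window of the hash
    (image_hash >> 32) % vocab_size,
    (image_hash >> 64) % vocab_size,
]
print(pad_value)  # four deterministic token ids derived from one hash
```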
sglang/srt/mem_cache/memory_pool.py
CHANGED
```diff
@@ -16,7 +16,8 @@ limitations under the License.
 """Memory pool."""

 import logging
-from
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Union

 import torch

@@ -52,14 +53,21 @@ class ReqToTokenPool:
         self.free_slots = list(range(self.size))


-class BaseTokenToKVPool:
+class BaseTokenToKVPool(ABC):
     """A memory pool that maps a token to its kv cache locations"""

     def __init__(
         self,
         size: int,
+        dtype: torch.dtype,
     ):
         self.size = size
+        self.dtype = dtype
+        if dtype == torch.float8_e5m2:
+            # NOTE: Store as torch.uint8 because Tensor index_put is not implemented for torch.float8_e5m2
+            self.store_dtype = torch.uint8
+        else:
+            self.store_dtype = dtype

         # We also add one slot. This slot is used for writing dummy output from padded tokens.
         self.mem_state = torch.ones((self.size + 1,), dtype=torch.bool, device="cuda")
@@ -112,6 +120,28 @@ class BaseTokenToKVPool:
         # We also add one slot. This slot is used for writing dummy output from padded tokens.
         self.mem_state[0] = False

+    @abstractmethod
+    def get_key_buffer(self, layer_id: int) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_value_buffer(self, layer_id: int) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_kv_buffer(self, layer_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def set_kv_buffer(
+        self,
+        layer_id: int,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError()
+

 class MHATokenToKVPool(BaseTokenToKVPool):

@@ -123,26 +153,52 @@ class MHATokenToKVPool(BaseTokenToKVPool):
         head_dim: int,
         layer_num: int,
     ):
-        super().__init__(size)
+        super().__init__(size, dtype)

         # [size, head_num, head_dim] for each layer
         self.k_buffer = [
-            torch.empty(
+            torch.empty(
+                (size + 1, head_num, head_dim), dtype=self.store_dtype, device="cuda"
+            )
             for _ in range(layer_num)
         ]
         self.v_buffer = [
-            torch.empty(
+            torch.empty(
+                (size + 1, head_num, head_dim), dtype=self.store_dtype, device="cuda"
+            )
             for _ in range(layer_num)
         ]

     def get_key_buffer(self, layer_id: int):
+        if self.store_dtype != self.dtype:
+            return self.k_buffer[layer_id].view(self.dtype)
         return self.k_buffer[layer_id]

     def get_value_buffer(self, layer_id: int):
+        if self.store_dtype != self.dtype:
+            return self.v_buffer[layer_id].view(self.dtype)
         return self.v_buffer[layer_id]

     def get_kv_buffer(self, layer_id: int):
-        return self.
+        return self.get_key_buffer(layer_id), self.get_value_buffer(layer_id)
+
+    def set_kv_buffer(
+        self,
+        layer_id: int,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ):
+        if cache_k.dtype != self.dtype:
+            cache_k = cache_k.to(self.dtype)
+        if cache_v.dtype != self.dtype:
+            cache_v = cache_v.to(self.dtype)
+        if self.store_dtype != self.dtype:
+            self.k_buffer[layer_id][loc] = cache_k.view(self.store_dtype)
+            self.v_buffer[layer_id][loc] = cache_v.view(self.store_dtype)
+        else:
+            self.k_buffer[layer_id][loc] = cache_k
+            self.v_buffer[layer_id][loc] = cache_v


 class MLATokenToKVPool(BaseTokenToKVPool):
@@ -155,23 +211,41 @@ class MLATokenToKVPool(BaseTokenToKVPool):
         qk_rope_head_dim: int,
         layer_num: int,
     ):
-        super().__init__(size)
+        super().__init__(size, dtype)

         self.kv_lora_rank = kv_lora_rank
         self.kv_buffer = [
             torch.empty(
                 (size + 1, 1, kv_lora_rank + qk_rope_head_dim),
-                dtype=
+                dtype=self.store_dtype,
                 device="cuda",
             )
             for _ in range(layer_num)
         ]

     def get_key_buffer(self, layer_id: int):
+        if self.store_dtype != self.dtype:
+            return self.kv_buffer[layer_id].view(self.dtype)
         return self.kv_buffer[layer_id]

     def get_value_buffer(self, layer_id: int):
+        if self.store_dtype != self.dtype:
+            return self.kv_buffer[layer_id][..., : self.kv_lora_rank].view(self.dtype)
         return self.kv_buffer[layer_id][..., : self.kv_lora_rank]

     def get_kv_buffer(self, layer_id: int):
         return self.get_key_buffer(layer_id), self.get_value_buffer(layer_id)
+
+    def set_kv_buffer(
+        self,
+        layer_id: int,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ):
+        if cache_k.dtype != self.dtype:
+            cache_k = cache_k.to(self.dtype)
+        if self.store_dtype != self.dtype:
+            self.kv_buffer[layer_id][loc] = cache_k.view(self.store_dtype)
+        else:
+            self.kv_buffer[layer_id][loc] = cache_k
```
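The new `store_dtype` logic works around `index_put` not being implemented for `torch.float8_e5m2`: the KV buffers are allocated as `torch.uint8`, writes reinterpret the fp8 bytes as uint8 with `Tensor.view(dtype)`, and reads reinterpret them back. A minimal sketch of the same trick, assuming a PyTorch build that ships the float8 dtypes (2.1 or newer); shapes and values are arbitrary:

```python
# Hedged sketch of the uint8-backed fp8 storage used by set_kv_buffer above.
import torch

logical_dtype = torch.float8_e5m2  # what attention kernels want to see
store_dtype = torch.uint8          # same 1-byte elements, but indexing works

buf = torch.empty(8, 4, dtype=store_dtype)       # KV slots
new_k = torch.randn(2, 4, dtype=torch.float16)   # incoming cache values
loc = torch.tensor([3, 5])                       # slots to write

# Write: cast to fp8, then reinterpret the bytes as uint8 for index_put.
buf[loc] = new_k.to(logical_dtype).view(store_dtype)

# Read: reinterpret the stored bytes back as fp8 (lossy vs. the fp16 input).
restored = buf[loc].view(logical_dtype).to(torch.float16)
print(restored)
```

`get_key_buffer` does the read-side reinterpretation once for the whole buffer (`self.k_buffer[layer_id].view(self.dtype)`), so downstream kernels never see the uint8 detour.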
sglang/srt/mm_utils.py
CHANGED
```diff
@@ -13,10 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

-# Source: https://github.com/
+# Source: https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/llava/mm_utils.py
+"""
+Utilities for multi-modal models.
+
+This python file mainly contains utilities that were used in the
+image processing logic of llava-next including operations such as
+anyres and anyres_max
+
+Currently supports the anyres and anyres_max operation for CLIP and
+SigLip. For more information, you may refer to the paper or the blog
+
+LLaVA-NeXT : https://llava-vl.github.io/blog/2024-01-30-llava-next/
+LLaVA-Onevision : https://arxiv.org/pdf/2408.03326
+
+"""
 import ast
 import base64
 import math
+import re
 from io import BytesIO

 import numpy as np
@@ -40,10 +55,13 @@ def select_best_resolution(original_size, possible_resolutions):
     min_wasted_resolution = float("inf")

     for width, height in possible_resolutions:
+        # Calculate the downscaled size to keep the aspect ratio
         scale = min(width / original_width, height / original_height)
         downscaled_width, downscaled_height = int(original_width * scale), int(
             original_height * scale
         )
+
+        # Calculate effective and wasted resolutions
         effective_resolution = min(
             downscaled_width * downscaled_height, original_width * original_height
         )
@@ -129,6 +147,26 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
     Returns:
         tuple: The shape of the image patch grid in the format (width, height).
     """
+    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+        assert patch_size in [
+            224,
+            336,
+            384,
+            448,
+            512,
+        ], "patch_size should be in [224, 336, 384, 448, 512]"
+        # Use regex to extract the range from the input string
+        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+        range_start = tuple(map(int, matches[0]))
+        range_end = tuple(map(int, matches[-1]))
+        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+        grid_pinpoints = [
+            (i, j)
+            for i in range(range_start[0], range_end[0] + 1)
+            for j in range(range_start[1], range_end[1] + 1)
+        ]
+        # Multiply all elements by patch_size
+        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
     if type(grid_pinpoints) is list:
         possible_resolutions = grid_pinpoints
     else:
@@ -149,6 +187,31 @@ def process_anyres_image(image, processor, grid_pinpoints):
     Returns:
         np.array: An np array containing the processed image patches.
     """
+    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+        try:
+            patch_size = processor.size[0]
+        except Exception as e:
+            patch_size = processor.size["shortest_edge"]
+        assert patch_size in [
+            224,
+            336,
+            384,
+            448,
+            512,
+        ], "patch_size should be in [224, 336, 384, 448, 512]"
+        # Use regex to extract the range from the input string
+        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+        range_start = tuple(map(int, matches[0]))
+        range_end = tuple(map(int, matches[-1]))
+        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+        grid_pinpoints = [
+            (i, j)
+            for i in range(range_start[0], range_end[0] + 1)
+            for j in range(range_start[1], range_end[1] + 1)
+        ]
+        # Multiply all elements by patch_size
+        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
+
     if type(grid_pinpoints) is list:
         possible_resolutions = grid_pinpoints
     else:
@@ -156,15 +219,24 @@ def process_anyres_image(image, processor, grid_pinpoints):
     best_resolution = select_best_resolution(image.size, possible_resolutions)
     image_padded = resize_and_pad_image(image, best_resolution)

-
-
-
-
+    # For Siglip processor, only have size but no crop size
+    crop_size = (
+        processor.crop_size["height"]
+        if "crop_size" in processor.__dict__
+        else processor.size["height"]
     )
+    shortest_edge = (
+        processor.size["shortest_edge"]
+        if "shortest_edge" in processor.size
+        else processor.size["height"]
+    )
+    patches = divide_to_patches(image_padded, crop_size)
+
+    image_original_resize = image.resize((shortest_edge, shortest_edge))

     image_patches = [image_original_resize] + patches
     image_patches = [
-        processor.preprocess(image_patch)["pixel_values"][0]
+        processor.preprocess(image_patch.convert("RGB"))["pixel_values"][0]
         for image_patch in image_patches
     ]
     return np.stack(image_patches, axis=0)
@@ -255,7 +327,7 @@ def process_images(images, image_processor, model_cfg):
             )
             image = image_processor.preprocess(image)["pixel_values"][0]
             new_images.append(image)
-    elif
+    elif "anyres" in image_aspect_ratio:
         for image in images:
             image = process_anyres_image(
                 image, image_processor, model_cfg.image_grid_pinpoints
```